@agentv/core 4.17.1 → 4.19.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -341,7 +341,7 @@ type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
341
341
  /**
342
342
  * Configuration for tool-trajectory evaluator.
343
343
  */
344
- interface ToolTrajectoryEvaluatorConfig {
344
+ interface ToolTrajectoryGraderConfig {
345
345
  readonly name: string;
346
346
  readonly type: 'tool-trajectory';
347
347
  /** Matching mode */
@@ -355,7 +355,7 @@ interface ToolTrajectoryEvaluatorConfig {
355
355
  readonly required?: boolean | number;
356
356
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
357
357
  readonly min_score?: number;
358
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
358
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
359
359
  readonly negate?: boolean;
360
360
  /** Default argument matching mode for all expected items (defaults to 'exact') */
361
361
  readonly argsMatch?: ArgsMatchMode | readonly string[];
@@ -539,9 +539,9 @@ declare function isJsonValue(value: unknown): value is JsonValue;
539
539
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
540
540
  */
541
541
  declare function isTestMessage(value: unknown): value is TestMessage;
542
- declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
543
- type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
544
- declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
542
+ declare const GRADER_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
543
+ type GraderKind = (typeof GRADER_KIND_VALUES)[number];
544
+ declare function isGraderKind(value: unknown): value is GraderKind;
545
545
  /**
546
546
  * Configuration for enabling target access in code-grader evaluators.
547
547
  * When present, the runtime will start a local proxy server that allows
@@ -697,7 +697,7 @@ type WorkspaceConfig = {
697
697
  * relative paths from their own directory, not the eval file's directory. */
698
698
  readonly workspaceFileDir?: string;
699
699
  };
700
- type CodeEvaluatorConfig = {
700
+ type CodeGraderConfig = {
701
701
  readonly name: string;
702
702
  readonly type: 'code-grader';
703
703
  readonly command: readonly string[];
@@ -710,7 +710,7 @@ type CodeEvaluatorConfig = {
710
710
  readonly required?: boolean | number;
711
711
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
712
712
  readonly min_score?: number;
713
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
713
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
714
714
  readonly negate?: boolean;
715
715
  /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
716
716
  readonly config?: JsonObject;
@@ -739,7 +739,7 @@ type ContentPreprocessorConfig = {
739
739
  /** Resolved absolute path for the command script (last argv element) */
740
740
  readonly resolvedCommand?: readonly string[];
741
741
  };
742
- type LlmGraderEvaluatorConfig = {
742
+ type LlmGraderConfig = {
743
743
  readonly name: string;
744
744
  readonly type: 'llm-grader';
745
745
  /** Text prompt (inline or file path) or executable script config */
@@ -754,7 +754,7 @@ type LlmGraderEvaluatorConfig = {
754
754
  readonly required?: boolean | number;
755
755
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
756
756
  readonly min_score?: number;
757
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
757
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
758
758
  readonly negate?: boolean;
759
759
  /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
760
760
  readonly target?: string;
@@ -767,8 +767,6 @@ type LlmGraderEvaluatorConfig = {
767
767
  /** Optional content preprocessors for ContentFile blocks in assistant output */
768
768
  readonly preprocessors?: readonly ContentPreprocessorConfig[];
769
769
  };
770
- /** @deprecated Use `LlmGraderEvaluatorConfig` instead */
771
- type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
772
770
  /**
773
771
  * Score range definition for analytic rubric scoring.
774
772
  * Each range maps an integer score band (0-10) to an outcome description.
@@ -830,16 +828,16 @@ type CompositeAggregatorConfig = {
830
828
  readonly type: 'threshold';
831
829
  readonly threshold: number;
832
830
  };
833
- type CompositeEvaluatorConfig = {
831
+ type CompositeGraderConfig = {
834
832
  readonly name: string;
835
833
  readonly type: 'composite';
836
- readonly assertions: readonly EvaluatorConfig[];
834
+ readonly assertions: readonly GraderConfig[];
837
835
  readonly aggregator: CompositeAggregatorConfig;
838
836
  readonly weight?: number;
839
837
  readonly required?: boolean | number;
840
838
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
841
839
  readonly min_score?: number;
842
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
840
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
843
841
  readonly negate?: boolean;
844
842
  };
845
843
  /**
@@ -874,7 +872,7 @@ type FieldConfig = {
874
872
  /**
875
873
  * Configuration for the field-accuracy evaluator.
876
874
  */
877
- type FieldAccuracyEvaluatorConfig = {
875
+ type FieldAccuracyGraderConfig = {
878
876
  readonly name: string;
879
877
  readonly type: 'field-accuracy';
880
878
  /** Fields to compare between candidate and expected */
@@ -885,14 +883,14 @@ type FieldAccuracyEvaluatorConfig = {
885
883
  readonly required?: boolean | number;
886
884
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
887
885
  readonly min_score?: number;
888
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
886
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
889
887
  readonly negate?: boolean;
890
888
  };
891
889
  /**
892
890
  * Configuration for the latency evaluator.
893
891
  * Checks execution duration against a threshold.
894
892
  */
895
- type LatencyEvaluatorConfig = {
893
+ type LatencyGraderConfig = {
896
894
  readonly name: string;
897
895
  readonly type: 'latency';
898
896
  /** Maximum allowed duration in milliseconds */
@@ -901,14 +899,14 @@ type LatencyEvaluatorConfig = {
901
899
  readonly required?: boolean | number;
902
900
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
903
901
  readonly min_score?: number;
904
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
902
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
905
903
  readonly negate?: boolean;
906
904
  };
907
905
  /**
908
906
  * Configuration for the cost evaluator.
909
907
  * Checks execution cost against a budget.
910
908
  */
911
- type CostEvaluatorConfig = {
909
+ type CostGraderConfig = {
912
910
  readonly name: string;
913
911
  readonly type: 'cost';
914
912
  /** Maximum allowed cost in USD */
@@ -917,14 +915,14 @@ type CostEvaluatorConfig = {
917
915
  readonly required?: boolean | number;
918
916
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
919
917
  readonly min_score?: number;
920
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
918
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
921
919
  readonly negate?: boolean;
922
920
  };
923
921
  /**
924
922
  * Configuration for the token-usage evaluator.
925
923
  * Checks provider-reported token usage against configured limits.
926
924
  */
927
- type TokenUsageEvaluatorConfig = {
925
+ type TokenUsageGraderConfig = {
928
926
  readonly name: string;
929
927
  readonly type: 'token-usage';
930
928
  /** Maximum allowed total tokens (input + output + cached, when present) */
@@ -937,7 +935,7 @@ type TokenUsageEvaluatorConfig = {
937
935
  readonly required?: boolean | number;
938
936
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
939
937
  readonly min_score?: number;
940
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
938
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
941
939
  readonly negate?: boolean;
942
940
  };
943
941
  /**
@@ -945,7 +943,7 @@ type TokenUsageEvaluatorConfig = {
945
943
  * Provides declarative threshold-based checks on execution metrics.
946
944
  * Only specified thresholds are checked; omitted ones are ignored.
947
945
  */
948
- type ExecutionMetricsEvaluatorConfig = {
946
+ type ExecutionMetricsGraderConfig = {
949
947
  readonly name: string;
950
948
  readonly type: 'execution-metrics';
951
949
  /** Maximum allowed number of tool calls */
@@ -966,14 +964,14 @@ type ExecutionMetricsEvaluatorConfig = {
966
964
  readonly required?: boolean | number;
967
965
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
968
966
  readonly min_score?: number;
969
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
967
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
970
968
  readonly negate?: boolean;
971
969
  };
972
970
  /**
973
971
  * Configuration for the contains assertion evaluator.
974
972
  * Checks whether the candidate output contains a specified substring.
975
973
  */
976
- type ContainsEvaluatorConfig = {
974
+ type ContainsGraderConfig = {
977
975
  readonly name: string;
978
976
  readonly type: 'contains';
979
977
  readonly value: string;
@@ -981,14 +979,14 @@ type ContainsEvaluatorConfig = {
981
979
  readonly required?: boolean | number;
982
980
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
983
981
  readonly min_score?: number;
984
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
982
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
985
983
  readonly negate?: boolean;
986
984
  };
987
985
  /**
988
986
  * Configuration for the contains_any assertion evaluator.
989
987
  * Checks whether the candidate output contains ANY of the specified substrings.
990
988
  */
991
- type ContainsAnyEvaluatorConfig = {
989
+ type ContainsAnyGraderConfig = {
992
990
  readonly name: string;
993
991
  readonly type: 'contains-any';
994
992
  readonly value: readonly string[];
@@ -996,14 +994,14 @@ type ContainsAnyEvaluatorConfig = {
996
994
  readonly required?: boolean | number;
997
995
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
998
996
  readonly min_score?: number;
999
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
997
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1000
998
  readonly negate?: boolean;
1001
999
  };
1002
1000
  /**
1003
1001
  * Configuration for the contains_all assertion evaluator.
1004
1002
  * Checks whether the candidate output contains ALL of the specified substrings.
1005
1003
  */
1006
- type ContainsAllEvaluatorConfig = {
1004
+ type ContainsAllGraderConfig = {
1007
1005
  readonly name: string;
1008
1006
  readonly type: 'contains-all';
1009
1007
  readonly value: readonly string[];
@@ -1011,14 +1009,14 @@ type ContainsAllEvaluatorConfig = {
1011
1009
  readonly required?: boolean | number;
1012
1010
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1013
1011
  readonly min_score?: number;
1014
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1012
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1015
1013
  readonly negate?: boolean;
1016
1014
  };
1017
1015
  /**
1018
1016
  * Configuration for the icontains assertion evaluator.
1019
1017
  * Case-insensitive check whether the candidate output contains a specified substring.
1020
1018
  */
1021
- type IcontainsEvaluatorConfig = {
1019
+ type IcontainsGraderConfig = {
1022
1020
  readonly name: string;
1023
1021
  readonly type: 'icontains';
1024
1022
  readonly value: string;
@@ -1026,14 +1024,14 @@ type IcontainsEvaluatorConfig = {
1026
1024
  readonly required?: boolean | number;
1027
1025
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1028
1026
  readonly min_score?: number;
1029
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1027
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1030
1028
  readonly negate?: boolean;
1031
1029
  };
1032
1030
  /**
1033
1031
  * Configuration for the icontains_any assertion evaluator.
1034
1032
  * Case-insensitive check whether the candidate output contains ANY of the specified substrings.
1035
1033
  */
1036
- type IcontainsAnyEvaluatorConfig = {
1034
+ type IcontainsAnyGraderConfig = {
1037
1035
  readonly name: string;
1038
1036
  readonly type: 'icontains-any';
1039
1037
  readonly value: readonly string[];
@@ -1041,14 +1039,14 @@ type IcontainsAnyEvaluatorConfig = {
1041
1039
  readonly required?: boolean | number;
1042
1040
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1043
1041
  readonly min_score?: number;
1044
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1042
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1045
1043
  readonly negate?: boolean;
1046
1044
  };
1047
1045
  /**
1048
1046
  * Configuration for the icontains_all assertion evaluator.
1049
1047
  * Case-insensitive check whether the candidate output contains ALL of the specified substrings.
1050
1048
  */
1051
- type IcontainsAllEvaluatorConfig = {
1049
+ type IcontainsAllGraderConfig = {
1052
1050
  readonly name: string;
1053
1051
  readonly type: 'icontains-all';
1054
1052
  readonly value: readonly string[];
@@ -1056,14 +1054,14 @@ type IcontainsAllEvaluatorConfig = {
1056
1054
  readonly required?: boolean | number;
1057
1055
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1058
1056
  readonly min_score?: number;
1059
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1057
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1060
1058
  readonly negate?: boolean;
1061
1059
  };
1062
1060
  /**
1063
1061
  * Configuration for the starts_with assertion evaluator.
1064
1062
  * Checks whether the candidate output starts with a specified string (both trimmed).
1065
1063
  */
1066
- type StartsWithEvaluatorConfig = {
1064
+ type StartsWithGraderConfig = {
1067
1065
  readonly name: string;
1068
1066
  readonly type: 'starts-with';
1069
1067
  readonly value: string;
@@ -1071,14 +1069,14 @@ type StartsWithEvaluatorConfig = {
1071
1069
  readonly required?: boolean | number;
1072
1070
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1073
1071
  readonly min_score?: number;
1074
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1072
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1075
1073
  readonly negate?: boolean;
1076
1074
  };
1077
1075
  /**
1078
1076
  * Configuration for the ends_with assertion evaluator.
1079
1077
  * Checks whether the candidate output ends with a specified string (both trimmed).
1080
1078
  */
1081
- type EndsWithEvaluatorConfig = {
1079
+ type EndsWithGraderConfig = {
1082
1080
  readonly name: string;
1083
1081
  readonly type: 'ends-with';
1084
1082
  readonly value: string;
@@ -1086,14 +1084,14 @@ type EndsWithEvaluatorConfig = {
1086
1084
  readonly required?: boolean | number;
1087
1085
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1088
1086
  readonly min_score?: number;
1089
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1087
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1090
1088
  readonly negate?: boolean;
1091
1089
  };
1092
1090
  /**
1093
1091
  * Configuration for the regex assertion evaluator.
1094
1092
  * Checks whether the candidate output matches a regular expression pattern.
1095
1093
  */
1096
- type RegexEvaluatorConfig = {
1094
+ type RegexGraderConfig = {
1097
1095
  readonly name: string;
1098
1096
  readonly type: 'regex';
1099
1097
  readonly value: string;
@@ -1103,28 +1101,28 @@ type RegexEvaluatorConfig = {
1103
1101
  readonly required?: boolean | number;
1104
1102
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1105
1103
  readonly min_score?: number;
1106
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1104
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1107
1105
  readonly negate?: boolean;
1108
1106
  };
1109
1107
  /**
1110
1108
  * Configuration for the is_json assertion evaluator.
1111
1109
  * Checks whether the candidate output is valid JSON.
1112
1110
  */
1113
- type IsJsonEvaluatorConfig = {
1111
+ type IsJsonGraderConfig = {
1114
1112
  readonly name: string;
1115
1113
  readonly type: 'is-json';
1116
1114
  readonly weight?: number;
1117
1115
  readonly required?: boolean | number;
1118
1116
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1119
1117
  readonly min_score?: number;
1120
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1118
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1121
1119
  readonly negate?: boolean;
1122
1120
  };
1123
1121
  /**
1124
1122
  * Configuration for the equals assertion evaluator.
1125
1123
  * Checks whether the candidate output exactly equals a specified string.
1126
1124
  */
1127
- type EqualsEvaluatorConfig = {
1125
+ type EqualsGraderConfig = {
1128
1126
  readonly name: string;
1129
1127
  readonly type: 'equals';
1130
1128
  readonly value: string;
@@ -1132,7 +1130,7 @@ type EqualsEvaluatorConfig = {
1132
1130
  readonly required?: boolean | number;
1133
1131
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1134
1132
  readonly min_score?: number;
1135
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1133
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1136
1134
  readonly negate?: boolean;
1137
1135
  };
1138
1136
  /**
@@ -1147,7 +1145,7 @@ type RubricsEvaluatorConfig = {
1147
1145
  readonly required?: boolean | number;
1148
1146
  /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1149
1147
  readonly min_score?: number;
1150
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1148
+ /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
1151
1149
  readonly negate?: boolean;
1152
1150
  };
1153
1151
  /**
@@ -1156,7 +1154,7 @@ type RubricsEvaluatorConfig = {
1156
1154
  * Tool-name resolution is automatic based on the provider kind.
1157
1155
  * For providers not covered by the built-in mapping, use a code-grader.
1158
1156
  */
1159
- type SkillTriggerEvaluatorConfig = {
1157
+ type SkillTriggerGraderConfig = {
1160
1158
  readonly name: string;
1161
1159
  readonly type: 'skill-trigger';
1162
1160
  /** The skill name to check for (case-sensitive substring match) */
@@ -1182,7 +1180,7 @@ type InlineAssertEvaluatorConfig = {
1182
1180
  readonly min_score?: number;
1183
1181
  readonly negate?: boolean;
1184
1182
  };
1185
- type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
1183
+ type GraderConfig = CodeGraderConfig | LlmGraderConfig | CompositeGraderConfig | ToolTrajectoryGraderConfig | FieldAccuracyGraderConfig | LatencyGraderConfig | CostGraderConfig | TokenUsageGraderConfig | ExecutionMetricsGraderConfig | SkillTriggerGraderConfig | ContainsGraderConfig | ContainsAnyGraderConfig | ContainsAllGraderConfig | IcontainsGraderConfig | IcontainsAnyGraderConfig | IcontainsAllGraderConfig | StartsWithGraderConfig | EndsWithGraderConfig | RegexGraderConfig | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
1186
1184
  /**
1187
1185
  * A single turn in a multi-turn conversation evaluation.
1188
1186
  * Each turn is a user message. The runner generates the assistant response.
@@ -1193,7 +1191,7 @@ interface ConversationTurn {
1193
1191
  /** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */
1194
1192
  readonly expected_output?: TestMessageContent;
1195
1193
  /** Per-turn assertions. Strings become rubric criteria via shorthand. */
1196
- readonly assertions?: readonly (string | EvaluatorConfig)[];
1194
+ readonly assertions?: readonly (string | GraderConfig)[];
1197
1195
  }
1198
1196
  /**
1199
1197
  * Conversation evaluation mode.
@@ -1228,8 +1226,8 @@ interface EvalTest {
1228
1226
  readonly reference_answer?: string;
1229
1227
  readonly file_paths: readonly string[];
1230
1228
  readonly criteria: string;
1231
- readonly evaluator?: EvaluatorKind;
1232
- readonly assertions?: readonly EvaluatorConfig[];
1229
+ readonly evaluator?: GraderKind;
1230
+ readonly assertions?: readonly GraderConfig[];
1233
1231
  /** Suite-level preprocessors used by the implicit default llm-grader. */
1234
1232
  readonly preprocessors?: readonly ContentPreprocessorConfig[];
1235
1233
  /** Workspace configuration (merged from suite-level and case-level) */
@@ -1293,7 +1291,7 @@ interface TrialResult {
1293
1291
  readonly attempt: number;
1294
1292
  readonly score: number;
1295
1293
  readonly verdict: EvaluationVerdict;
1296
- readonly scores?: readonly EvaluatorResult[];
1294
+ readonly scores?: readonly GraderResult[];
1297
1295
  readonly error?: string;
1298
1296
  readonly costUsd?: number;
1299
1297
  /** Primary classification for this trial attempt */
@@ -1359,7 +1357,7 @@ interface ExecutionError {
1359
1357
  */
1360
1358
  type FailOnError = boolean;
1361
1359
  /**
1362
- * Evaluator scorecard for a single eval case run.
1360
+ * Grader scorecard for a single eval case run.
1363
1361
  */
1364
1362
  interface EvaluationResult {
1365
1363
  readonly timestamp: string;
@@ -1390,7 +1388,7 @@ interface EvaluationResult {
1390
1388
  readonly lm?: JsonObject;
1391
1389
  readonly evaluator?: JsonObject;
1392
1390
  };
1393
- readonly scores?: readonly EvaluatorResult[];
1391
+ readonly scores?: readonly GraderResult[];
1394
1392
  readonly error?: string;
1395
1393
  /** Lightweight summary of the execution trace (always included when available) */
1396
1394
  readonly trace?: TraceSummary;
@@ -1433,9 +1431,9 @@ interface EvaluationResult {
1433
1431
  readonly executionError?: ExecutionError;
1434
1432
  }
1435
1433
  type EvaluationVerdict = 'pass' | 'fail' | 'skip';
1436
- interface EvaluatorResult {
1434
+ interface GraderResult {
1437
1435
  readonly name: string;
1438
- readonly type: EvaluatorKind;
1436
+ readonly type: GraderKind;
1439
1437
  readonly score: number;
1440
1438
  readonly weight?: number;
1441
1439
  readonly verdict?: EvaluationVerdict;
@@ -1444,7 +1442,7 @@ interface EvaluatorResult {
1444
1442
  readonly input?: JsonObject;
1445
1443
  /** Target name used for grading (e.g., the LLM provider name). */
1446
1444
  readonly target?: string;
1447
- readonly scores?: readonly EvaluatorResult[];
1445
+ readonly scores?: readonly GraderResult[];
1448
1446
  /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
1449
1447
  readonly details?: JsonObject;
1450
1448
  /** Token usage from LLM calls made by this evaluator (optional). */
@@ -1457,156 +1455,558 @@ interface EvaluatorResult {
1457
1455
  readonly endedAt?: string;
1458
1456
  }
1459
1457
 
1460
- declare const MetadataSchema: z.ZodObject<{
1461
- name: z.ZodString;
1462
- description: z.ZodOptional<z.ZodString>;
1463
- version: z.ZodOptional<z.ZodString>;
1464
- author: z.ZodOptional<z.ZodString>;
1465
- tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1466
- license: z.ZodOptional<z.ZodString>;
1467
- requires: z.ZodOptional<z.ZodObject<{
1468
- agentv: z.ZodOptional<z.ZodString>;
1469
- }, "strip", z.ZodTypeAny, {
1470
- agentv?: string | undefined;
1458
+ /**
1459
+ * Strict normalized schema for CLI target configuration.
1460
+ * This is the final validated shape after environment variable resolution
1461
+ * and internal field normalization.
1462
+ *
1463
+ * Uses .strict() to reject unknown properties, ensuring configuration
1464
+ * errors are caught early rather than silently ignored.
1465
+ *
1466
+ * @example
1467
+ * ```typescript
1468
+ * const config: CliNormalizedConfig = {
1469
+ * command: 'agent run {PROMPT}',
1470
+ * timeoutMs: 120000,
1471
+ * verbose: true,
1472
+ * };
1473
+ * CliTargetConfigSchema.parse(config); // Validates the normalized config
1474
+ * ```
1475
+ */
1476
+ declare const CliTargetConfigSchema: z.ZodObject<{
1477
+ command: z.ZodString;
1478
+ filesFormat: z.ZodOptional<z.ZodString>;
1479
+ cwd: z.ZodOptional<z.ZodString>;
1480
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
1481
+ healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
1482
+ url: z.ZodString;
1483
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
1484
+ }, "strict", z.ZodTypeAny, {
1485
+ url: string;
1486
+ timeoutMs?: number | undefined;
1471
1487
  }, {
1472
- agentv?: string | undefined;
1473
- }>>;
1474
- }, "strip", z.ZodTypeAny, {
1475
- name: string;
1476
- description?: string | undefined;
1477
- version?: string | undefined;
1478
- author?: string | undefined;
1479
- tags?: string[] | undefined;
1480
- license?: string | undefined;
1481
- requires?: {
1482
- agentv?: string | undefined;
1488
+ url: string;
1489
+ timeoutMs?: number | undefined;
1490
+ }>, z.ZodObject<{
1491
+ command: z.ZodString;
1492
+ cwd: z.ZodOptional<z.ZodString>;
1493
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
1494
+ }, "strict", z.ZodTypeAny, {
1495
+ command: string;
1496
+ timeoutMs?: number | undefined;
1497
+ cwd?: string | undefined;
1498
+ }, {
1499
+ command: string;
1500
+ timeoutMs?: number | undefined;
1501
+ cwd?: string | undefined;
1502
+ }>]>>;
1503
+ verbose: z.ZodOptional<z.ZodBoolean>;
1504
+ keepTempFiles: z.ZodOptional<z.ZodBoolean>;
1505
+ }, "strict", z.ZodTypeAny, {
1506
+ command: string;
1507
+ timeoutMs?: number | undefined;
1508
+ cwd?: string | undefined;
1509
+ verbose?: boolean | undefined;
1510
+ healthcheck?: {
1511
+ url: string;
1512
+ timeoutMs?: number | undefined;
1513
+ } | {
1514
+ command: string;
1515
+ timeoutMs?: number | undefined;
1516
+ cwd?: string | undefined;
1483
1517
  } | undefined;
1518
+ filesFormat?: string | undefined;
1519
+ keepTempFiles?: boolean | undefined;
1484
1520
  }, {
1485
- name: string;
1486
- description?: string | undefined;
1487
- version?: string | undefined;
1488
- author?: string | undefined;
1489
- tags?: string[] | undefined;
1490
- license?: string | undefined;
1491
- requires?: {
1492
- agentv?: string | undefined;
1521
+ command: string;
1522
+ timeoutMs?: number | undefined;
1523
+ cwd?: string | undefined;
1524
+ verbose?: boolean | undefined;
1525
+ healthcheck?: {
1526
+ url: string;
1527
+ timeoutMs?: number | undefined;
1528
+ } | {
1529
+ command: string;
1530
+ timeoutMs?: number | undefined;
1531
+ cwd?: string | undefined;
1493
1532
  } | undefined;
1533
+ filesFormat?: string | undefined;
1534
+ keepTempFiles?: boolean | undefined;
1494
1535
  }>;
1495
- type EvalMetadata = z.infer<typeof MetadataSchema>;
1496
-
1497
- declare const DEFAULT_EVAL_PATTERNS: readonly string[];
1498
- type ExecutionDefaults = {
1499
- readonly verbose?: boolean;
1500
- readonly keep_workspaces?: boolean;
1501
- readonly otel_file?: string;
1502
- readonly export_otel?: boolean;
1503
- readonly otel_backend?: string;
1504
- readonly otel_capture_content?: boolean;
1505
- readonly otel_group_turns?: boolean;
1506
- readonly pool_workspaces?: boolean;
1507
- readonly pool_slots?: number;
1508
- };
1509
- type ResultsExportConfig = {
1510
- readonly repo: string;
1511
- readonly path: string;
1512
- readonly auto_push?: boolean;
1513
- readonly branch_prefix?: string;
1514
- };
1515
- type AgentVConfig$1 = {
1516
- readonly required_version?: string;
1517
- readonly eval_patterns?: readonly string[];
1518
- readonly execution?: ExecutionDefaults;
1519
- readonly results?: {
1520
- readonly export?: ResultsExportConfig;
1521
- };
1522
- };
1523
- /**
1524
- * Load optional .agentv/config.yaml configuration file.
1525
- * Searches from eval file directory up to repo root.
1526
- */
1527
- declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
1528
- /**
1529
- * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
1530
- */
1531
- declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
1532
- /**
1533
- * Extract target refs from parsed eval suite.
1534
- * Supports both string shorthand and object form with hooks.
1535
- * Returns undefined when no targets array is specified.
1536
- */
1537
- declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
1538
- /**
1539
- * Extract target names from parsed eval suite (backward-compat wrapper).
1540
- * Precedence: execution.targets (array) > execution.target (singular).
1541
- * Returns undefined when no targets array is specified.
1542
- */
1543
- declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
1544
- /**
1545
- * Extract workers count from suite-level execution block.
1546
- */
1547
- declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
1548
- /**
1549
- * Extract per-test targets array from a raw test case object.
1550
- */
1551
- declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
1552
- /**
1553
- * Extract trials configuration from parsed eval suite's execution block.
1554
- * Returns undefined when count is 1 or not specified (no-op).
1555
- */
1556
- declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
1536
+ type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
1557
1537
  /**
1558
- * Cache configuration parsed from execution block.
1538
+ * Resolved CLI configuration type derived from CliTargetConfigSchema.
1539
+ * This is the final validated shape used by the CLI provider at runtime.
1540
+ * Using Readonly to ensure immutability for runtime safety.
1559
1541
  */
1560
- interface CacheConfig {
1561
- readonly enabled: boolean;
1562
- readonly cachePath?: string;
1542
+ type CliResolvedConfig = Readonly<CliNormalizedConfig>;
1543
+ interface RetryConfig {
1544
+ readonly maxRetries?: number;
1545
+ readonly initialDelayMs?: number;
1546
+ readonly maxDelayMs?: number;
1547
+ readonly backoffFactor?: number;
1548
+ readonly retryableStatusCodes?: readonly number[];
1563
1549
  }
1564
1550
  /**
1565
- * Extract cache configuration from parsed eval suite's execution block.
1566
- * Returns undefined when no cache config is specified.
1567
- */
1568
- declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1569
- /**
1570
- * Extract `execution.fail_on_error` from parsed eval suite.
1571
- * Accepts `true` or `false`.
1572
- * Returns undefined when not specified.
1551
+ * Selects which OpenAI-compatible API endpoint to use.
1552
+ * - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
1553
+ * - "responses": POST /responses — only supported by api.openai.com.
1554
+ *
1555
+ * Maps to Vercel AI SDK methods: "chat" → provider.chat(model), "responses" → provider(model).
1573
1556
  */
1574
- declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1557
+ type ApiFormat = 'chat' | 'responses';
1575
1558
  /**
1576
- * Extract `execution.threshold` from parsed eval suite.
1577
- * Accepts a number in [0, 1] range.
1578
- * Returns undefined when not specified.
1559
+ * Azure OpenAI settings used by the Vercel AI SDK.
1579
1560
  */
1580
- declare function extractThreshold(suite: JsonObject): number | undefined;
1581
-
1561
+ interface AzureResolvedConfig {
1562
+ readonly resourceName: string;
1563
+ readonly deploymentName: string;
1564
+ readonly apiKey: string;
1565
+ readonly version?: string;
1566
+ readonly apiFormat?: ApiFormat;
1567
+ readonly temperature?: number;
1568
+ readonly maxOutputTokens?: number;
1569
+ readonly retry?: RetryConfig;
1570
+ }
1582
1571
  /**
1583
- * Formatting mode for segment content.
1584
- * - 'agent': File references only (for providers with filesystem access)
1585
- * - 'lm': Embedded file content with XML tags (for language model providers)
1572
+ * OpenAI-compatible settings used by the Vercel AI SDK.
1586
1573
  */
1587
- type FormattingMode = 'agent' | 'lm';
1588
-
1574
+ interface OpenAIResolvedConfig {
1575
+ readonly baseURL: string;
1576
+ readonly apiKey: string;
1577
+ readonly model: string;
1578
+ readonly apiFormat?: ApiFormat;
1579
+ readonly temperature?: number;
1580
+ readonly maxOutputTokens?: number;
1581
+ readonly retry?: RetryConfig;
1582
+ }
1589
1583
  /**
1590
- * Build prompt inputs by consolidating user request context.
1584
+ * OpenRouter settings used by the Vercel AI SDK provider.
1591
1585
  */
1592
- interface PromptInputs {
1593
- readonly question: string;
1594
- readonly chatPrompt?: ChatPrompt;
1595
- readonly systemMessage?: string;
1586
+ interface OpenRouterResolvedConfig {
1587
+ readonly apiKey: string;
1588
+ readonly model: string;
1589
+ readonly temperature?: number;
1590
+ readonly maxOutputTokens?: number;
1591
+ readonly retry?: RetryConfig;
1596
1592
  }
1597
1593
  /**
1598
- * Build prompt inputs by consolidating user request context.
1599
- *
1600
- * @param testCase - The evaluation test case
1601
- * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
1594
+ * Anthropic Claude settings used by the Vercel AI SDK.
1602
1595
  */
1603
- declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
1604
-
1596
+ interface AnthropicResolvedConfig {
1597
+ readonly apiKey: string;
1598
+ readonly model: string;
1599
+ readonly temperature?: number;
1600
+ readonly maxOutputTokens?: number;
1601
+ readonly thinkingBudget?: number;
1602
+ readonly retry?: RetryConfig;
1603
+ }
1605
1604
  /**
1606
- * Detect file format by extension.
1605
+ * Google Gemini settings used by the Vercel AI SDK.
1607
1606
  */
1608
- declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json';
1609
-
1607
+ interface GeminiResolvedConfig {
1608
+ readonly apiKey: string;
1609
+ readonly model: string;
1610
+ readonly temperature?: number;
1611
+ readonly maxOutputTokens?: number;
1612
+ readonly retry?: RetryConfig;
1613
+ }
1614
+ interface CodexResolvedConfig {
1615
+ readonly model?: string;
1616
+ readonly executable: string;
1617
+ readonly args?: readonly string[];
1618
+ readonly cwd?: string;
1619
+ readonly timeoutMs?: number;
1620
+ readonly logDir?: string;
1621
+ readonly logFormat?: 'summary' | 'json';
1622
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1623
+ readonly streamLog?: false | 'raw' | 'summary';
1624
+ readonly systemPrompt?: string;
1625
+ }
1626
+ interface CopilotCliResolvedConfig {
1627
+ readonly executable: string;
1628
+ readonly model?: string;
1629
+ readonly args?: readonly string[];
1630
+ readonly cwd?: string;
1631
+ readonly timeoutMs?: number;
1632
+ readonly logDir?: string;
1633
+ readonly logFormat?: 'summary' | 'json';
1634
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1635
+ readonly streamLog?: false | 'raw' | 'summary';
1636
+ readonly systemPrompt?: string;
1637
+ }
1638
+ interface CopilotSdkResolvedConfig {
1639
+ readonly cliUrl?: string;
1640
+ readonly cliPath?: string;
1641
+ readonly githubToken?: string;
1642
+ readonly model?: string;
1643
+ readonly cwd?: string;
1644
+ readonly timeoutMs?: number;
1645
+ readonly logDir?: string;
1646
+ readonly logFormat?: 'summary' | 'json';
1647
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1648
+ readonly streamLog?: false | 'raw' | 'summary';
1649
+ readonly systemPrompt?: string;
1650
+ /** BYOK provider type: "azure", "openai", or "anthropic". */
1651
+ readonly byokType?: string;
1652
+ /** BYOK base URL for the provider endpoint. */
1653
+ readonly byokBaseUrl?: string;
1654
+ /** BYOK API key for authenticating with the provider. */
1655
+ readonly byokApiKey?: string;
1656
+ /** BYOK bearer token (takes precedence over apiKey when set). */
1657
+ readonly byokBearerToken?: string;
1658
+ /** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
1659
+ readonly byokApiVersion?: string;
1660
+ /** BYOK wire API format: "completions" or "responses". */
1661
+ readonly byokWireApi?: string;
1662
+ }
1663
+ interface CopilotLogResolvedConfig {
1664
+ /** Explicit path to a session directory containing events.jsonl. */
1665
+ readonly sessionDir?: string;
1666
+ /** Session UUID — combined with sessionStateDir to build the path. */
1667
+ readonly sessionId?: string;
1668
+ /** Auto-discovery mode. 'latest' picks the most recent session. */
1669
+ readonly discover?: 'latest';
1670
+ /** Override the default ~/.copilot/session-state directory. */
1671
+ readonly sessionStateDir?: string;
1672
+ /** Filter discovery by working directory. */
1673
+ readonly cwd?: string;
1674
+ }
1675
+ interface PiCodingAgentResolvedConfig {
1676
+ readonly subprovider?: string;
1677
+ readonly model?: string;
1678
+ readonly apiKey?: string;
1679
+ readonly baseUrl?: string;
1680
+ readonly tools?: string;
1681
+ readonly thinking?: string;
1682
+ readonly cwd?: string;
1683
+ readonly timeoutMs?: number;
1684
+ readonly logDir?: string;
1685
+ readonly logFormat?: 'summary' | 'json';
1686
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1687
+ readonly streamLog?: false | 'raw' | 'summary';
1688
+ readonly systemPrompt?: string;
1689
+ }
1690
+ interface PiCliResolvedConfig {
1691
+ readonly executable: string;
1692
+ readonly subprovider?: string;
1693
+ readonly model?: string;
1694
+ readonly apiKey?: string;
1695
+ readonly baseUrl?: string;
1696
+ readonly tools?: string;
1697
+ readonly thinking?: string;
1698
+ readonly args?: readonly string[];
1699
+ readonly cwd?: string;
1700
+ readonly timeoutMs?: number;
1701
+ readonly logDir?: string;
1702
+ readonly logFormat?: 'summary' | 'json';
1703
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1704
+ readonly streamLog?: false | 'raw' | 'summary';
1705
+ readonly systemPrompt?: string;
1706
+ }
1707
+ interface ClaudeResolvedConfig {
1708
+ readonly executable: string;
1709
+ readonly model?: string;
1710
+ readonly systemPrompt?: string;
1711
+ readonly cwd?: string;
1712
+ readonly timeoutMs?: number;
1713
+ readonly maxTurns?: number;
1714
+ readonly maxBudgetUsd?: number;
1715
+ readonly logDir?: string;
1716
+ readonly logFormat?: 'summary' | 'json';
1717
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1718
+ readonly streamLog?: false | 'raw' | 'summary';
1719
+ }
1720
+ interface MockResolvedConfig {
1721
+ readonly response?: string;
1722
+ readonly delayMs?: number;
1723
+ readonly delayMinMs?: number;
1724
+ readonly delayMaxMs?: number;
1725
+ }
1726
+ interface VSCodeResolvedConfig {
1727
+ readonly executable: string;
1728
+ readonly waitForResponse: boolean;
1729
+ readonly dryRun: boolean;
1730
+ readonly subagentRoot?: string;
1731
+ readonly timeoutMs?: number;
1732
+ }
1733
+ interface AgentVResolvedConfig {
1734
+ readonly model: string;
1735
+ readonly temperature: number;
1736
+ }
1737
+ /** Base fields shared by all resolved targets. */
1738
+ interface ResolvedTargetBase {
1739
+ readonly name: string;
1740
+ readonly graderTarget?: string;
1741
+ readonly workers?: number;
1742
+ readonly providerBatching?: boolean;
1743
+ /**
1744
+ * Whether this target can be executed via executor subagents in subagent mode.
1745
+ * Defaults to `true` for all non-CLI providers. Set `false` in targets.yaml
1746
+ * to force CLI invocation even in subagent mode.
1747
+ */
1748
+ readonly subagentModeAllowed?: boolean;
1749
+ /**
1750
+ * Ordered list of target names to try when the primary target fails after
1751
+ * exhausting retries. Each fallback is attempted in order.
1752
+ */
1753
+ readonly fallbackTargets?: readonly string[];
1754
+ }
1755
+ type ResolvedTarget = (ResolvedTargetBase & {
1756
+ readonly kind: 'openai';
1757
+ readonly config: OpenAIResolvedConfig;
1758
+ }) | (ResolvedTargetBase & {
1759
+ readonly kind: 'openrouter';
1760
+ readonly config: OpenRouterResolvedConfig;
1761
+ }) | (ResolvedTargetBase & {
1762
+ readonly kind: 'azure';
1763
+ readonly config: AzureResolvedConfig;
1764
+ }) | (ResolvedTargetBase & {
1765
+ readonly kind: 'anthropic';
1766
+ readonly config: AnthropicResolvedConfig;
1767
+ }) | (ResolvedTargetBase & {
1768
+ readonly kind: 'gemini';
1769
+ readonly config: GeminiResolvedConfig;
1770
+ }) | (ResolvedTargetBase & {
1771
+ readonly kind: 'codex';
1772
+ readonly config: CodexResolvedConfig;
1773
+ }) | (ResolvedTargetBase & {
1774
+ readonly kind: 'copilot-sdk';
1775
+ readonly config: CopilotSdkResolvedConfig;
1776
+ }) | (ResolvedTargetBase & {
1777
+ readonly kind: 'copilot-cli';
1778
+ readonly config: CopilotCliResolvedConfig;
1779
+ }) | (ResolvedTargetBase & {
1780
+ readonly kind: 'copilot-log';
1781
+ readonly config: CopilotLogResolvedConfig;
1782
+ }) | (ResolvedTargetBase & {
1783
+ readonly kind: 'pi-coding-agent';
1784
+ readonly config: PiCodingAgentResolvedConfig;
1785
+ }) | (ResolvedTargetBase & {
1786
+ readonly kind: 'pi-cli';
1787
+ readonly config: PiCliResolvedConfig;
1788
+ }) | (ResolvedTargetBase & {
1789
+ readonly kind: 'claude';
1790
+ readonly config: ClaudeResolvedConfig;
1791
+ }) | (ResolvedTargetBase & {
1792
+ readonly kind: 'claude-cli';
1793
+ readonly config: ClaudeResolvedConfig;
1794
+ }) | (ResolvedTargetBase & {
1795
+ readonly kind: 'claude-sdk';
1796
+ readonly config: ClaudeResolvedConfig;
1797
+ }) | (ResolvedTargetBase & {
1798
+ readonly kind: 'mock';
1799
+ readonly config: MockResolvedConfig;
1800
+ }) | (ResolvedTargetBase & {
1801
+ readonly kind: 'vscode' | 'vscode-insiders';
1802
+ readonly config: VSCodeResolvedConfig;
1803
+ }) | (ResolvedTargetBase & {
1804
+ readonly kind: 'agentv';
1805
+ readonly config: AgentVResolvedConfig;
1806
+ }) | (ResolvedTargetBase & {
1807
+ readonly kind: 'cli';
1808
+ readonly config: CliResolvedConfig;
1809
+ }) | (ResolvedTargetBase & {
1810
+ readonly kind: 'transcript';
1811
+ readonly config: Record<string, never>;
1812
+ });
1813
+ /**
1814
+ * Optional settings accepted on ALL target definitions regardless of provider.
1815
+ * Exported so the targets validator can reuse the same list — adding a field
1816
+ * here automatically makes it valid in targets.yaml without a separate update.
1817
+ */
1818
+ declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
1819
+ declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
1820
+ declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
1821
+ readonly emitDeprecationWarnings?: boolean;
1822
+ }): ResolvedTarget;
1823
+
1824
+ /**
1825
+ * Extensible provider registry.
1826
+ *
1827
+ * Replaces the hardcoded switch/case dispatch in createProvider() with
1828
+ * a registry of named factory functions. Built-in providers are registered
1829
+ * at startup; users can add custom providers via the registry API or by
1830
+ * dropping files in `.agentv/providers/`.
1831
+ */
1832
+
1833
+ /**
1834
+ * Factory function that creates a Provider instance from a resolved target.
1835
+ */
1836
+ type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
1837
+ /**
1838
+ * Registry of provider factory functions keyed by provider kind.
1839
+ *
1840
+ * Built-in providers are registered at startup. Custom providers can be
1841
+ * registered via the `register()` method.
1842
+ */
1843
+ declare class ProviderRegistry {
1844
+ private readonly factories;
1845
+ /** Register a factory function for a provider kind. */
1846
+ register(kind: string, factory: ProviderFactoryFn): this;
1847
+ /** Get the factory function for a provider kind. */
1848
+ get(kind: string): ProviderFactoryFn | undefined;
1849
+ /** Check if a factory is registered for the given kind. */
1850
+ has(kind: string): boolean;
1851
+ /** List all registered provider kind names. */
1852
+ list(): string[];
1853
+ /**
1854
+ * Create a provider instance from a resolved target.
1855
+ * Falls back to CLI provider for unknown kinds (custom provider escape hatch).
1856
+ */
1857
+ create(target: ResolvedTarget): Provider;
1858
+ }
1859
+
1860
+ declare const MetadataSchema: z.ZodObject<{
1861
+ name: z.ZodString;
1862
+ description: z.ZodOptional<z.ZodString>;
1863
+ version: z.ZodOptional<z.ZodString>;
1864
+ author: z.ZodOptional<z.ZodString>;
1865
+ tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1866
+ license: z.ZodOptional<z.ZodString>;
1867
+ requires: z.ZodOptional<z.ZodObject<{
1868
+ agentv: z.ZodOptional<z.ZodString>;
1869
+ }, "strip", z.ZodTypeAny, {
1870
+ agentv?: string | undefined;
1871
+ }, {
1872
+ agentv?: string | undefined;
1873
+ }>>;
1874
+ }, "strip", z.ZodTypeAny, {
1875
+ name: string;
1876
+ description?: string | undefined;
1877
+ version?: string | undefined;
1878
+ author?: string | undefined;
1879
+ tags?: string[] | undefined;
1880
+ license?: string | undefined;
1881
+ requires?: {
1882
+ agentv?: string | undefined;
1883
+ } | undefined;
1884
+ }, {
1885
+ name: string;
1886
+ description?: string | undefined;
1887
+ version?: string | undefined;
1888
+ author?: string | undefined;
1889
+ tags?: string[] | undefined;
1890
+ license?: string | undefined;
1891
+ requires?: {
1892
+ agentv?: string | undefined;
1893
+ } | undefined;
1894
+ }>;
1895
+ type EvalMetadata = z.infer<typeof MetadataSchema>;
1896
+
1897
+ declare const DEFAULT_EVAL_PATTERNS: readonly string[];
1898
+ type ExecutionDefaults = {
1899
+ readonly verbose?: boolean;
1900
+ readonly keep_workspaces?: boolean;
1901
+ readonly otel_file?: string;
1902
+ readonly export_otel?: boolean;
1903
+ readonly otel_backend?: string;
1904
+ readonly otel_capture_content?: boolean;
1905
+ readonly otel_group_turns?: boolean;
1906
+ readonly pool_workspaces?: boolean;
1907
+ readonly pool_slots?: number;
1908
+ };
1909
+ type ResultsExportConfig = {
1910
+ readonly repo: string;
1911
+ readonly path: string;
1912
+ readonly auto_push?: boolean;
1913
+ readonly branch_prefix?: string;
1914
+ };
1915
+ type AgentVConfig$1 = {
1916
+ readonly required_version?: string;
1917
+ readonly eval_patterns?: readonly string[];
1918
+ readonly execution?: ExecutionDefaults;
1919
+ readonly results?: {
1920
+ readonly export?: ResultsExportConfig;
1921
+ };
1922
+ };
1923
+ /**
1924
+ * Load optional .agentv/config.yaml configuration file.
1925
+ * Searches from eval file directory up to repo root.
1926
+ */
1927
+ declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
1928
+ /**
1929
+ * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
1930
+ */
1931
+ declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
1932
+ /**
1933
+ * Extract target refs from parsed eval suite.
1934
+ * Supports both string shorthand and object form with hooks.
1935
+ * Returns undefined when no targets array is specified.
1936
+ */
1937
+ declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
1938
+ /**
1939
+ * Extract target names from parsed eval suite (backward-compat wrapper).
1940
+ * Precedence: execution.targets (array) > execution.target (singular).
1941
+ * Returns undefined when no targets array is specified.
1942
+ */
1943
+ declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
1944
+ /**
1945
+ * Extract workers count from suite-level execution block.
1946
+ */
1947
+ declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
1948
+ /**
1949
+ * Extract per-test targets array from a raw test case object.
1950
+ */
1951
+ declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
1952
+ /**
1953
+ * Extract trials configuration from parsed eval suite's execution block.
1954
+ * Returns undefined when count is 1 or not specified (no-op).
1955
+ */
1956
+ declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
1957
+ /**
1958
+ * Cache configuration parsed from execution block.
1959
+ */
1960
+ interface CacheConfig {
1961
+ readonly enabled: boolean;
1962
+ readonly cachePath?: string;
1963
+ }
1964
+ /**
1965
+ * Extract cache configuration from parsed eval suite's execution block.
1966
+ * Returns undefined when no cache config is specified.
1967
+ */
1968
+ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1969
+ /**
1970
+ * Extract `execution.fail_on_error` from parsed eval suite.
1971
+ * Accepts `true` or `false`.
1972
+ * Returns undefined when not specified.
1973
+ */
1974
+ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1975
+ /**
1976
+ * Extract `execution.threshold` from parsed eval suite.
1977
+ * Accepts a number in [0, 1] range.
1978
+ * Returns undefined when not specified.
1979
+ */
1980
+ declare function extractThreshold(suite: JsonObject): number | undefined;
1981
+
1982
+ /**
1983
+ * Formatting mode for segment content.
1984
+ * - 'agent': File references only (for providers with filesystem access)
1985
+ * - 'lm': Embedded file content with XML tags (for language model providers)
1986
+ */
1987
+ type FormattingMode = 'agent' | 'lm';
1988
+
1989
+ /**
1990
+ * Build prompt inputs by consolidating user request context.
1991
+ */
1992
+ interface PromptInputs {
1993
+ readonly question: string;
1994
+ readonly chatPrompt?: ChatPrompt;
1995
+ readonly systemMessage?: string;
1996
+ }
1997
+ /**
1998
+ * Build prompt inputs by consolidating user request context.
1999
+ *
2000
+ * @param testCase - The evaluation test case
2001
+ * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
2002
+ */
2003
+ declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
2004
+
2005
+ /**
2006
+ * Detect file format by extension.
2007
+ */
2008
+ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json' | 'typescript';
2009
+
1610
2010
  type LoadOptions = {
1611
2011
  readonly verbose?: boolean;
1612
2012
  /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
@@ -1642,13 +2042,17 @@ type EvalSuiteResult = {
1642
2042
  /** Suite-level metadata (name, description, version, etc.) */
1643
2043
  readonly metadata?: EvalMetadata;
1644
2044
  /** Suite-level total cost budget in USD */
1645
- readonly totalBudgetUsd?: number;
2045
+ readonly budgetUsd?: number;
1646
2046
  /** Execution error tolerance: true or false */
1647
2047
  readonly failOnError?: FailOnError;
1648
2048
  /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
1649
2049
  readonly threshold?: number;
1650
2050
  /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */
1651
2051
  readonly workspacePath?: string;
2052
+ /** Inline target definition from a TS eval config. */
2053
+ readonly inlineTarget?: TargetDefinition;
2054
+ /** Custom provider factory from a TS eval config task(). */
2055
+ readonly providerFactory?: ProviderFactoryFn;
1652
2056
  };
1653
2057
  /**
1654
2058
  * Load tests and suite metadata from a single parse.
@@ -1695,495 +2099,370 @@ declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEval
1695
2099
  declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
1696
2100
 
1697
2101
  /**
1698
- * EVAL.yaml → evals.json transpiler.
2102
+ * Types for inline assertion functions used in the evaluate() API.
1699
2103
  *
1700
- * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
1701
- * for consumption by the skill-creator pipeline.
2104
+ * Inline functions are the escape hatch for custom evaluation logic
2105
+ * that doesn't fit a built-in grader type. For built-in assertions
2106
+ * (contains, regex, is-json, etc.), use config objects instead:
1702
2107
  *
1703
- * Handles both `assertions:` (current) and `assert:` (deprecated alias).
1704
- */
1705
- interface EvalsJsonCase {
1706
- id: number;
1707
- prompt: string;
1708
- expected_output?: string;
1709
- files?: string[];
1710
- should_trigger?: boolean;
1711
- assertions: string[];
1712
- }
1713
- interface EvalsJsonFile {
1714
- skill_name: string;
1715
- evals: EvalsJsonCase[];
1716
- }
1717
- /**
1718
- * Result of transpiling a single EVAL.yaml.
1719
- * May produce multiple evals.json files (one per skill).
1720
- */
1721
- interface TranspileResult {
1722
- /** Map from skill_name → EvalsJsonFile */
1723
- files: Map<string, EvalsJsonFile>;
1724
- /** Warning messages accumulated during transpilation */
1725
- warnings: string[];
1726
- }
1727
- /**
1728
- * Transpile a parsed EVAL.yaml object into one or more evals.json objects.
2108
+ * assert: [{ type: 'contains', value: 'hello' }]
1729
2109
  *
1730
- * @param suite Parsed YAML object (already loaded, no file I/O here)
1731
- * @param source Source identifier for error messages (e.g. file path)
1732
- */
1733
- declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
1734
- /**
1735
- * Transpile an EVAL.yaml file into one or more evals.json objects.
1736
- * Returns a map from output filename → JSON content.
2110
+ * Inline functions are for custom logic:
1737
2111
  *
1738
- * @param evalYamlPath Absolute path to the EVAL.yaml file
1739
- */
1740
- declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
1741
- /**
1742
- * Determine the output filename(s) for a transpile result.
1743
- * Single skill → "evals.json"
1744
- * Multiple skills → "<skill>.evals.json"
1745
- */
1746
- declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
1747
-
1748
- declare function fileExists(filePath: string): Promise<boolean>;
1749
- /**
1750
- * Normalize line endings to LF (\n).
1751
- * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
1752
- */
1753
- declare function normalizeLineEndings(content: string): string;
1754
- /**
1755
- * Read a text file and normalize line endings to LF (\n).
1756
- * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
1757
- */
1758
- declare function readTextFile(filePath: string): Promise<string>;
1759
- /**
1760
- * Read a JSON file and parse it.
1761
- */
1762
- declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
1763
- /**
1764
- * Find git repository root by walking up the directory tree.
1765
- */
1766
- declare function findGitRoot(startPath: string): Promise<string | null>;
1767
- /**
1768
- * Build a chain of directories walking from a file's location up to repo root.
1769
- * Used for discovering configuration files like targets.yaml or config.yaml.
1770
- */
1771
- declare function buildDirectoryChain(filePath: string, repoRoot: string): readonly string[];
1772
- /**
1773
- * Build search roots for file resolution, matching yaml-parser behavior.
1774
- * Searches from eval file directory up to repo root.
1775
- */
1776
- declare function buildSearchRoots(evalPath: string, repoRoot: string): readonly string[];
1777
- /**
1778
- * Resolve a file reference using search roots, matching yaml-parser behavior.
2112
+ * assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
1779
2113
  */
1780
- declare function resolveFileReference(rawValue: string, searchRoots: readonly string[]): Promise<{
1781
- readonly displayPath: string;
1782
- readonly resolvedPath?: string;
1783
- readonly attempted: readonly string[];
1784
- }>;
2114
+ /** Context passed to inline assertion functions */
2115
+ interface AssertContext {
2116
+ readonly input: string;
2117
+ readonly output: string;
2118
+ readonly expectedOutput?: string;
2119
+ readonly criteria?: string;
2120
+ readonly metadata?: Record<string, unknown>;
2121
+ }
2122
+ /** Result from an inline assertion function */
2123
+ interface AssertResult {
2124
+ readonly name: string;
2125
+ readonly score: number;
2126
+ readonly metadata?: Record<string, unknown>;
2127
+ }
2128
+ /** Inline assertion function signature */
2129
+ type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
1785
2130
 
1786
2131
  /**
1787
- * Strict normalized schema for CLI target configuration.
1788
- * This is the final validated shape after environment variable resolution
1789
- * and internal field normalization.
2132
+ * Programmatic API for running evaluations.
1790
2133
  *
1791
- * Uses .strict() to reject unknown properties, ensuring configuration
1792
- * errors are caught early rather than silently ignored.
2134
+ * Provides `evaluate()` — a high-level function for using AgentV as a library
2135
+ * instead of a CLI. The config shape mirrors the YAML structure for easy
2136
+ * translation between file-based and programmatic usage.
1793
2137
  *
1794
- * @example
2138
+ * @example Inline tests with config objects
1795
2139
  * ```typescript
1796
- * const config: CliNormalizedConfig = {
1797
- * command: 'agent run {PROMPT}',
1798
- * timeoutMs: 120000,
1799
- * verbose: true,
1800
- * };
1801
- * CliTargetConfigSchema.parse(config); // Validates the normalized config
2140
+ * import { evaluate } from '@agentv/core';
2141
+ *
2142
+ * const results = await evaluate({
2143
+ * tests: [
2144
+ * {
2145
+ * id: 'capital',
2146
+ * input: 'What is the capital of France?',
2147
+ * expectedOutput: 'Paris',
2148
+ * assert: [{ type: 'contains', value: 'Paris' }],
2149
+ * },
2150
+ * ],
2151
+ * target: { provider: 'mock_agent' },
2152
+ * });
2153
+ *
2154
+ * console.log(results.summary.passed, 'passed');
2155
+ * ```
2156
+ *
2157
+ * @example Inline tests with task function and custom assertion
2158
+ * ```typescript
2159
+ * import { evaluate } from '@agentv/core';
2160
+ *
2161
+ * const { summary } = await evaluate({
2162
+ * tests: [
2163
+ * {
2164
+ * id: 'echo',
2165
+ * input: 'hello',
2166
+ * expectedOutput: 'Echo: hello',
2167
+ * assert: [
2168
+ * { type: 'contains', value: 'hello' },
2169
+ * { type: 'equals' },
2170
+ * ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
2171
+ * ],
2172
+ * },
2173
+ * ],
2174
+ * task: async (input) => `Echo: ${input}`,
2175
+ * });
1802
2176
  * ```
1803
- */
1804
- declare const CliTargetConfigSchema: z.ZodObject<{
1805
- command: z.ZodString;
1806
- filesFormat: z.ZodOptional<z.ZodString>;
1807
- cwd: z.ZodOptional<z.ZodString>;
1808
- timeoutMs: z.ZodOptional<z.ZodNumber>;
1809
- healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
1810
- url: z.ZodString;
1811
- timeoutMs: z.ZodOptional<z.ZodNumber>;
1812
- }, "strict", z.ZodTypeAny, {
1813
- url: string;
1814
- timeoutMs?: number | undefined;
1815
- }, {
1816
- url: string;
1817
- timeoutMs?: number | undefined;
1818
- }>, z.ZodObject<{
1819
- command: z.ZodString;
1820
- cwd: z.ZodOptional<z.ZodString>;
1821
- timeoutMs: z.ZodOptional<z.ZodNumber>;
1822
- }, "strict", z.ZodTypeAny, {
1823
- command: string;
1824
- timeoutMs?: number | undefined;
1825
- cwd?: string | undefined;
1826
- }, {
1827
- command: string;
1828
- timeoutMs?: number | undefined;
1829
- cwd?: string | undefined;
1830
- }>]>>;
1831
- verbose: z.ZodOptional<z.ZodBoolean>;
1832
- keepTempFiles: z.ZodOptional<z.ZodBoolean>;
1833
- }, "strict", z.ZodTypeAny, {
1834
- command: string;
1835
- timeoutMs?: number | undefined;
1836
- cwd?: string | undefined;
1837
- verbose?: boolean | undefined;
1838
- healthcheck?: {
1839
- url: string;
1840
- timeoutMs?: number | undefined;
1841
- } | {
1842
- command: string;
1843
- timeoutMs?: number | undefined;
1844
- cwd?: string | undefined;
1845
- } | undefined;
1846
- filesFormat?: string | undefined;
1847
- keepTempFiles?: boolean | undefined;
1848
- }, {
1849
- command: string;
1850
- timeoutMs?: number | undefined;
1851
- cwd?: string | undefined;
1852
- verbose?: boolean | undefined;
1853
- healthcheck?: {
1854
- url: string;
1855
- timeoutMs?: number | undefined;
1856
- } | {
1857
- command: string;
1858
- timeoutMs?: number | undefined;
1859
- cwd?: string | undefined;
1860
- } | undefined;
1861
- filesFormat?: string | undefined;
1862
- keepTempFiles?: boolean | undefined;
1863
- }>;
1864
- type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
1865
- /**
1866
- * Resolved CLI configuration type derived from CliTargetConfigSchema.
1867
- * This is the final validated shape used by the CLI provider at runtime.
1868
- * Using Readonly to ensure immutability for runtime safety.
1869
- */
1870
- type CliResolvedConfig = Readonly<CliNormalizedConfig>;
1871
- interface RetryConfig {
1872
- readonly maxRetries?: number;
1873
- readonly initialDelayMs?: number;
1874
- readonly maxDelayMs?: number;
1875
- readonly backoffFactor?: number;
1876
- readonly retryableStatusCodes?: readonly number[];
1877
- }
1878
- /**
1879
- * Selects which OpenAI-compatible API endpoint to use.
1880
- * - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
1881
- * - "responses": POST /responses — only supported by api.openai.com.
1882
2177
  *
1883
- * Maps to Vercel AI SDK methods: "chat" → provider.chat(model), "responses" → provider(model).
2178
+ * @example File-based
2179
+ * ```typescript
2180
+ * const results = await evaluate({
2181
+ * specFile: './evals/EVAL.yaml',
2182
+ * target: { provider: 'claude_agent' },
2183
+ * });
2184
+ * ```
2185
+ *
2186
+ * @module
1884
2187
  */
1885
- type ApiFormat = 'chat' | 'responses';
2188
+
1886
2189
  /**
1887
- * Azure OpenAI settings used by the Vercel AI SDK.
2190
+ * Inline test definition for the programmatic API.
2191
+ * Mirrors the YAML test structure.
1888
2192
  */
1889
- interface AzureResolvedConfig {
1890
- readonly resourceName: string;
1891
- readonly deploymentName: string;
1892
- readonly apiKey: string;
1893
- readonly version?: string;
1894
- readonly apiFormat?: ApiFormat;
1895
- readonly temperature?: number;
1896
- readonly maxOutputTokens?: number;
1897
- readonly retry?: RetryConfig;
2193
+ interface EvalTestInput {
2194
+ /** Unique test identifier */
2195
+ readonly id: string;
2196
+ /** What the response should accomplish */
2197
+ readonly criteria?: string;
2198
+ /** Input to the agent (string or message array). Omit when using turns[]. */
2199
+ readonly input?: string | readonly {
2200
+ role: string;
2201
+ content: string;
2202
+ }[];
2203
+ /** Expected reference output (camelCase preferred) */
2204
+ readonly expectedOutput?: string;
2205
+ /** @deprecated Use `expectedOutput` instead */
2206
+ readonly expected_output?: string;
2207
+ /** Assertion graders — accepts factory functions, config objects, or inline functions */
2208
+ readonly assert?: readonly AssertEntry[];
2209
+ /** Arbitrary metadata */
2210
+ readonly metadata?: Record<string, unknown>;
2211
+ /** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */
2212
+ readonly mode?: 'conversation';
2213
+ /** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
2214
+ readonly turns?: readonly ConversationTurnInput[];
2215
+ /** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */
2216
+ readonly aggregation?: ConversationAggregation;
1898
2217
  }
1899
2218
  /**
1900
- * OpenAI-compatible settings used by the Vercel AI SDK.
2219
+ * A single turn in a multi-turn conversation evaluation (programmatic API).
2220
+ * Mirrors the YAML `turns` structure with camelCase naming.
1901
2221
  */
1902
- interface OpenAIResolvedConfig {
1903
- readonly baseURL: string;
1904
- readonly apiKey: string;
1905
- readonly model: string;
1906
- readonly apiFormat?: ApiFormat;
1907
- readonly temperature?: number;
1908
- readonly maxOutputTokens?: number;
1909
- readonly retry?: RetryConfig;
2222
+ interface ConversationTurnInput {
2223
+ /** Input for this turn (string or message array) */
2224
+ readonly input: string | readonly {
2225
+ role: string;
2226
+ content: string;
2227
+ }[];
2228
+ /** Expected reference output for this turn */
2229
+ readonly expectedOutput?: string;
2230
+ /** @deprecated Use `expectedOutput` instead */
2231
+ readonly expected_output?: string;
2232
+ /** Per-turn assertions (string criteria or grader config) */
2233
+ readonly assert?: readonly AssertEntry[];
1910
2234
  }
1911
2235
  /**
1912
- * OpenRouter settings used by the Vercel AI SDK provider.
2236
+ * Inline assertion definition for the programmatic API.
2237
+ * Matches the YAML `assert` block structure.
1913
2238
  */
1914
- interface OpenRouterResolvedConfig {
1915
- readonly apiKey: string;
1916
- readonly model: string;
1917
- readonly temperature?: number;
1918
- readonly maxOutputTokens?: number;
1919
- readonly retry?: RetryConfig;
2239
+ interface EvalAssertionInput {
2240
+ /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
2241
+ readonly type: string;
2242
+ /** Display name */
2243
+ readonly name?: string;
2244
+ /** Value for deterministic assertions (contains, equals, regex) */
2245
+ readonly value?: string;
2246
+ /** Weight for scoring */
2247
+ readonly weight?: number;
2248
+ /** Whether this assertion is required to pass */
2249
+ readonly required?: boolean | number;
2250
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
2251
+ readonly min_score?: number;
2252
+ /** Prompt file for llm_grader */
2253
+ readonly prompt?: string;
2254
+ /** Script for code_grader */
2255
+ readonly script?: string | readonly string[];
2256
+ /** Additional config passed to the assertion */
2257
+ readonly config?: Record<string, unknown>;
2258
+ /** Nested assertions for composite type */
2259
+ readonly assert?: readonly EvalAssertionInput[];
2260
+ /** Rubric criteria for rubrics type */
2261
+ readonly criteria?: readonly (string | {
2262
+ id?: string;
2263
+ outcome: string;
2264
+ weight?: number;
2265
+ })[];
2266
+ /** Additional properties */
2267
+ readonly [key: string]: unknown;
1920
2268
  }
2269
+ /** Assert entry: inline function or config object */
2270
+ type AssertEntry = AssertFn | EvalAssertionInput;
1921
2271
  /**
1922
- * Anthropic Claude settings used by the Vercel AI SDK.
2272
+ * Configuration for `evaluate()`.
2273
+ * Accepts either inline tests or a spec file path.
1923
2274
  */
1924
- interface AnthropicResolvedConfig {
1925
- readonly apiKey: string;
1926
- readonly model: string;
1927
- readonly temperature?: number;
1928
- readonly maxOutputTokens?: number;
1929
- readonly thinkingBudget?: number;
1930
- readonly retry?: RetryConfig;
2275
+ interface EvalConfig {
2276
+ /** Inline test definitions (mutually exclusive with specFile) */
2277
+ readonly tests?: readonly EvalTestInput[];
2278
+ /** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
2279
+ readonly specFile?: string;
2280
+ /** Target provider configuration */
2281
+ readonly target?: TargetDefinition;
2282
+ /** Custom task function — mutually exclusive with target */
2283
+ readonly task?: (input: string) => string | Promise<string>;
2284
+ /** Suite-level assertions applied to all tests */
2285
+ readonly assert?: readonly AssertEntry[];
2286
+ /** Optional suite metadata used by CLI discovery, tagging, and reporting. */
2287
+ readonly metadata?: EvalMetadata;
2288
+ /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
2289
+ readonly filter?: string | readonly string[];
2290
+ /** Maximum concurrent workers (default: 3) */
2291
+ readonly workers?: number;
2292
+ /** Maximum retries on failure (default: 2) */
2293
+ readonly maxRetries?: number;
2294
+ /** Agent timeout in milliseconds. No timeout if not set. */
2295
+ readonly agentTimeoutMs?: number;
2296
+ /** Enable response caching */
2297
+ readonly cache?: boolean;
2298
+ /** Verbose logging */
2299
+ readonly verbose?: boolean;
2300
+ /** Callback for each completed result */
2301
+ readonly onResult?: (result: EvaluationResult) => void;
2302
+ /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
2303
+ readonly threshold?: number;
2304
+ /** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */
2305
+ readonly beforeAll?: string | readonly string[];
2306
+ /** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */
2307
+ readonly budgetUsd?: number;
1931
2308
  }
1932
2309
  /**
1933
- * Google Gemini settings used by the Vercel AI SDK.
2310
+ * Summary statistics for an evaluation run.
1934
2311
  */
1935
- interface GeminiResolvedConfig {
1936
- readonly apiKey: string;
1937
- readonly model: string;
1938
- readonly temperature?: number;
1939
- readonly maxOutputTokens?: number;
1940
- readonly retry?: RetryConfig;
1941
- }
1942
- interface CodexResolvedConfig {
1943
- readonly model?: string;
1944
- readonly executable: string;
1945
- readonly args?: readonly string[];
1946
- readonly cwd?: string;
1947
- readonly timeoutMs?: number;
1948
- readonly logDir?: string;
1949
- readonly logFormat?: 'summary' | 'json';
1950
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1951
- readonly streamLog?: false | 'raw' | 'summary';
1952
- readonly systemPrompt?: string;
1953
- }
1954
- interface CopilotCliResolvedConfig {
1955
- readonly executable: string;
1956
- readonly model?: string;
1957
- readonly args?: readonly string[];
1958
- readonly cwd?: string;
1959
- readonly timeoutMs?: number;
1960
- readonly logDir?: string;
1961
- readonly logFormat?: 'summary' | 'json';
1962
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1963
- readonly streamLog?: false | 'raw' | 'summary';
1964
- readonly systemPrompt?: string;
1965
- }
1966
- interface CopilotSdkResolvedConfig {
1967
- readonly cliUrl?: string;
1968
- readonly cliPath?: string;
1969
- readonly githubToken?: string;
1970
- readonly model?: string;
1971
- readonly cwd?: string;
1972
- readonly timeoutMs?: number;
1973
- readonly logDir?: string;
1974
- readonly logFormat?: 'summary' | 'json';
1975
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1976
- readonly streamLog?: false | 'raw' | 'summary';
1977
- readonly systemPrompt?: string;
1978
- /** BYOK provider type: "azure", "openai", or "anthropic". */
1979
- readonly byokType?: string;
1980
- /** BYOK base URL for the provider endpoint. */
1981
- readonly byokBaseUrl?: string;
1982
- /** BYOK API key for authenticating with the provider. */
1983
- readonly byokApiKey?: string;
1984
- /** BYOK bearer token (takes precedence over apiKey when set). */
1985
- readonly byokBearerToken?: string;
1986
- /** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
1987
- readonly byokApiVersion?: string;
1988
- /** BYOK wire API format: "completions" or "responses". */
1989
- readonly byokWireApi?: string;
1990
- }
1991
- interface CopilotLogResolvedConfig {
1992
- /** Explicit path to a session directory containing events.jsonl. */
1993
- readonly sessionDir?: string;
1994
- /** Session UUID — combined with sessionStateDir to build the path. */
1995
- readonly sessionId?: string;
1996
- /** Auto-discovery mode. 'latest' picks the most recent session. */
1997
- readonly discover?: 'latest';
1998
- /** Override the default ~/.copilot/session-state directory. */
1999
- readonly sessionStateDir?: string;
2000
- /** Filter discovery by working directory. */
2001
- readonly cwd?: string;
2002
- }
2003
- interface PiCodingAgentResolvedConfig {
2004
- readonly subprovider?: string;
2005
- readonly model?: string;
2006
- readonly apiKey?: string;
2007
- readonly baseUrl?: string;
2008
- readonly tools?: string;
2009
- readonly thinking?: string;
2010
- readonly cwd?: string;
2011
- readonly timeoutMs?: number;
2012
- readonly logDir?: string;
2013
- readonly logFormat?: 'summary' | 'json';
2014
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
2015
- readonly streamLog?: false | 'raw' | 'summary';
2016
- readonly systemPrompt?: string;
2017
- }
2018
- interface PiCliResolvedConfig {
2019
- readonly executable: string;
2020
- readonly subprovider?: string;
2021
- readonly model?: string;
2022
- readonly apiKey?: string;
2023
- readonly baseUrl?: string;
2024
- readonly tools?: string;
2025
- readonly thinking?: string;
2026
- readonly args?: readonly string[];
2027
- readonly cwd?: string;
2028
- readonly timeoutMs?: number;
2029
- readonly logDir?: string;
2030
- readonly logFormat?: 'summary' | 'json';
2031
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
2032
- readonly streamLog?: false | 'raw' | 'summary';
2033
- readonly systemPrompt?: string;
2312
+ interface EvalSummary {
2313
+ /** Total number of test cases */
2314
+ readonly total: number;
2315
+ /** Number of passing test cases (score >= threshold) */
2316
+ readonly passed: number;
2317
+ /** Number of failing test cases (score < threshold) */
2318
+ readonly failed: number;
2319
+ /** Total duration in milliseconds */
2320
+ readonly durationMs: number;
2321
+ /** Mean score across all cases */
2322
+ readonly meanScore: number;
2034
2323
  }
2035
- interface ClaudeResolvedConfig {
2036
- readonly executable: string;
2037
- readonly model?: string;
2038
- readonly systemPrompt?: string;
2039
- readonly cwd?: string;
2040
- readonly timeoutMs?: number;
2041
- readonly maxTurns?: number;
2042
- readonly maxBudgetUsd?: number;
2043
- readonly logDir?: string;
2044
- readonly logFormat?: 'summary' | 'json';
2045
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
2046
- readonly streamLog?: false | 'raw' | 'summary';
2324
+ /**
2325
+ * Result of an `evaluate()` call.
2326
+ */
2327
+ interface EvalRunResult {
2328
+ /** Individual test case results */
2329
+ readonly results: readonly EvaluationResult[];
2330
+ /** Aggregate summary statistics */
2331
+ readonly summary: EvalSummary;
2047
2332
  }
2048
- interface MockResolvedConfig {
2049
- readonly response?: string;
2050
- readonly delayMs?: number;
2051
- readonly delayMinMs?: number;
2052
- readonly delayMaxMs?: number;
2333
+ /**
2334
+ * Run an evaluation suite against a target provider.
2335
+ *
2336
+ * Accepts either inline test definitions or a path to an EVAL.yaml spec file.
2337
+ * The config shape mirrors the YAML structure — users can translate between
2338
+ * file-based and programmatic usage 1:1.
2339
+ *
2340
+ * @param config - Evaluation configuration
2341
+ * @returns Typed evaluation results with summary statistics
2342
+ *
2343
+ * @example Inline tests with assertions
2344
+ * ```typescript
2345
+ * const { results, summary } = await evaluate({
2346
+ * tests: [
2347
+ * {
2348
+ * id: 'greeting',
2349
+ * input: 'Say hello',
2350
+ * assert: [{ type: 'contains', value: 'hello' }],
2351
+ * },
2352
+ * ],
2353
+ * target: { provider: 'mock_agent' },
2354
+ * });
2355
+ * console.log(`${summary.passed}/${summary.total} passed`);
2356
+ * ```
2357
+ *
2358
+ * @example Load from YAML
2359
+ * ```typescript
2360
+ * const { summary } = await evaluate({
2361
+ * specFile: './evals/my-eval.yaml',
2362
+ * filter: 'greeting-*',
2363
+ * });
2364
+ * ```
2365
+ */
2366
+ declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
2367
+
2368
+ interface TsEvalResult {
2369
+ readonly config: EvalConfig;
2370
+ readonly filePath: string;
2053
2371
  }
2054
- interface VSCodeResolvedConfig {
2055
- readonly executable: string;
2056
- readonly waitForResponse: boolean;
2057
- readonly dryRun: boolean;
2058
- readonly subagentRoot?: string;
2059
- readonly timeoutMs?: number;
2372
+ /**
2373
+ * Import a *.eval.ts file and extract the EvalConfig export.
2374
+ * Tries default, `config`, and `evalConfig` named exports in priority order.
2375
+ */
2376
+ declare function loadTsEvalFile(filePath: string): Promise<TsEvalResult>;
2377
+
2378
+ /**
2379
+ * EVAL.yaml → evals.json transpiler.
2380
+ *
2381
+ * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
2382
+ * for consumption by the skill-creator pipeline.
2383
+ *
2384
+ * Handles both `assertions:` (current) and `assert:` (deprecated alias).
2385
+ */
2386
+ interface EvalsJsonCase {
2387
+ id: number;
2388
+ prompt: string;
2389
+ expected_output?: string;
2390
+ files?: string[];
2391
+ should_trigger?: boolean;
2392
+ assertions: string[];
2060
2393
  }
2061
- interface AgentVResolvedConfig {
2062
- readonly model: string;
2063
- readonly temperature: number;
2394
+ interface EvalsJsonFile {
2395
+ skill_name: string;
2396
+ evals: EvalsJsonCase[];
2064
2397
  }
2065
- /** Base fields shared by all resolved targets. */
2066
- interface ResolvedTargetBase {
2067
- readonly name: string;
2068
- readonly graderTarget?: string;
2069
- readonly workers?: number;
2070
- readonly providerBatching?: boolean;
2071
- /**
2072
- * Whether this target can be executed via executor subagents in subagent mode.
2073
- * Defaults to `true` for all non-CLI providers. Set `false` in targets.yaml
2074
- * to force CLI invocation even in subagent mode.
2075
- */
2076
- readonly subagentModeAllowed?: boolean;
2077
- /**
2078
- * Ordered list of target names to try when the primary target fails after
2079
- * exhausting retries. Each fallback is attempted in order.
2080
- */
2081
- readonly fallbackTargets?: readonly string[];
2398
+ /**
2399
+ * Result of transpiling a single EVAL.yaml.
2400
+ * May produce multiple evals.json files (one per skill).
2401
+ */
2402
+ interface TranspileResult {
2403
+ /** Map from skill_name → EvalsJsonFile */
2404
+ files: Map<string, EvalsJsonFile>;
2405
+ /** Warning messages accumulated during transpilation */
2406
+ warnings: string[];
2082
2407
  }
2083
- type ResolvedTarget = (ResolvedTargetBase & {
2084
- readonly kind: 'openai';
2085
- readonly config: OpenAIResolvedConfig;
2086
- }) | (ResolvedTargetBase & {
2087
- readonly kind: 'openrouter';
2088
- readonly config: OpenRouterResolvedConfig;
2089
- }) | (ResolvedTargetBase & {
2090
- readonly kind: 'azure';
2091
- readonly config: AzureResolvedConfig;
2092
- }) | (ResolvedTargetBase & {
2093
- readonly kind: 'anthropic';
2094
- readonly config: AnthropicResolvedConfig;
2095
- }) | (ResolvedTargetBase & {
2096
- readonly kind: 'gemini';
2097
- readonly config: GeminiResolvedConfig;
2098
- }) | (ResolvedTargetBase & {
2099
- readonly kind: 'codex';
2100
- readonly config: CodexResolvedConfig;
2101
- }) | (ResolvedTargetBase & {
2102
- readonly kind: 'copilot-sdk';
2103
- readonly config: CopilotSdkResolvedConfig;
2104
- }) | (ResolvedTargetBase & {
2105
- readonly kind: 'copilot-cli';
2106
- readonly config: CopilotCliResolvedConfig;
2107
- }) | (ResolvedTargetBase & {
2108
- readonly kind: 'copilot-log';
2109
- readonly config: CopilotLogResolvedConfig;
2110
- }) | (ResolvedTargetBase & {
2111
- readonly kind: 'pi-coding-agent';
2112
- readonly config: PiCodingAgentResolvedConfig;
2113
- }) | (ResolvedTargetBase & {
2114
- readonly kind: 'pi-cli';
2115
- readonly config: PiCliResolvedConfig;
2116
- }) | (ResolvedTargetBase & {
2117
- readonly kind: 'claude';
2118
- readonly config: ClaudeResolvedConfig;
2119
- }) | (ResolvedTargetBase & {
2120
- readonly kind: 'claude-cli';
2121
- readonly config: ClaudeResolvedConfig;
2122
- }) | (ResolvedTargetBase & {
2123
- readonly kind: 'claude-sdk';
2124
- readonly config: ClaudeResolvedConfig;
2125
- }) | (ResolvedTargetBase & {
2126
- readonly kind: 'mock';
2127
- readonly config: MockResolvedConfig;
2128
- }) | (ResolvedTargetBase & {
2129
- readonly kind: 'vscode' | 'vscode-insiders';
2130
- readonly config: VSCodeResolvedConfig;
2131
- }) | (ResolvedTargetBase & {
2132
- readonly kind: 'agentv';
2133
- readonly config: AgentVResolvedConfig;
2134
- }) | (ResolvedTargetBase & {
2135
- readonly kind: 'cli';
2136
- readonly config: CliResolvedConfig;
2137
- }) | (ResolvedTargetBase & {
2138
- readonly kind: 'transcript';
2139
- readonly config: Record<string, never>;
2140
- });
2141
2408
  /**
2142
- * Optional settings accepted on ALL target definitions regardless of provider.
2143
- * Exported so the targets validator can reuse the same list — adding a field
2144
- * here automatically makes it valid in targets.yaml without a separate update.
2409
+ * Transpile a parsed EVAL.yaml object into one or more evals.json objects.
2410
+ *
2411
+ * @param suite Parsed YAML object (already loaded, no file I/O here)
2412
+ * @param source Source identifier for error messages (e.g. file path)
2145
2413
  */
2146
- declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
2147
- declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
2148
- declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
2149
- readonly emitDeprecationWarnings?: boolean;
2150
- }): ResolvedTarget;
2151
-
2414
+ declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
2152
2415
  /**
2153
- * Extensible provider registry.
2416
+ * Transpile an EVAL.yaml file into one or more evals.json objects.
2417
+ * Returns a map from output filename → JSON content.
2154
2418
  *
2155
- * Replaces the hardcoded switch/case dispatch in createProvider() with
2156
- * a registry of named factory functions. Built-in providers are registered
2157
- * at startup; users can add custom providers via the registry API or by
2158
- * dropping files in `.agentv/providers/`.
2419
+ * @param evalYamlPath Absolute path to the EVAL.yaml file
2420
+ */
2421
+ declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
2422
+ /**
2423
+ * Determine the output filename(s) for a transpile result.
2424
+ * Single skill → "evals.json"
2425
+ * Multiple skills → "<skill>.evals.json"
2159
2426
  */
2427
+ declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
2160
2428
 
2429
+ declare function fileExists(filePath: string): Promise<boolean>;
2161
2430
  /**
2162
- * Factory function that creates a Provider instance from a resolved target.
2431
+ * Normalize line endings to LF (\n).
2432
+ * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
2163
2433
  */
2164
- type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
2434
+ declare function normalizeLineEndings(content: string): string;
2165
2435
  /**
2166
- * Registry of provider factory functions keyed by provider kind.
2167
- *
2168
- * Built-in providers are registered at startup. Custom providers can be
2169
- * registered via the `register()` method.
2436
+ * Read a text file and normalize line endings to LF (\n).
2437
+ * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
2170
2438
  */
2171
- declare class ProviderRegistry {
2172
- private readonly factories;
2173
- /** Register a factory function for a provider kind. */
2174
- register(kind: string, factory: ProviderFactoryFn): this;
2175
- /** Get the factory function for a provider kind. */
2176
- get(kind: string): ProviderFactoryFn | undefined;
2177
- /** Check if a factory is registered for the given kind. */
2178
- has(kind: string): boolean;
2179
- /** List all registered provider kind names. */
2180
- list(): string[];
2181
- /**
2182
- * Create a provider instance from a resolved target.
2183
- * Falls back to CLI provider for unknown kinds (custom provider escape hatch).
2184
- */
2185
- create(target: ResolvedTarget): Provider;
2186
- }
2439
+ declare function readTextFile(filePath: string): Promise<string>;
2440
+ /**
2441
+ * Read a JSON file and parse it.
2442
+ */
2443
+ declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
2444
+ /**
2445
+ * Find git repository root by walking up the directory tree.
2446
+ */
2447
+ declare function findGitRoot(startPath: string): Promise<string | null>;
2448
+ /**
2449
+ * Build a chain of directories walking from a file's location up to repo root.
2450
+ * Used for discovering configuration files like targets.yaml or config.yaml.
2451
+ */
2452
+ declare function buildDirectoryChain(filePath: string, repoRoot: string): readonly string[];
2453
+ /**
2454
+ * Build search roots for file resolution, matching yaml-parser behavior.
2455
+ * Searches from eval file directory up to repo root.
2456
+ */
2457
+ declare function buildSearchRoots(evalPath: string, repoRoot: string): readonly string[];
2458
+ /**
2459
+ * Resolve a file reference using search roots, matching yaml-parser behavior.
2460
+ */
2461
+ declare function resolveFileReference(rawValue: string, searchRoots: readonly string[]): Promise<{
2462
+ readonly displayPath: string;
2463
+ readonly resolvedPath?: string;
2464
+ readonly attempted: readonly string[];
2465
+ }>;
2187
2466
 
2188
2467
  declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
2189
2468
  declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
@@ -2346,8 +2625,8 @@ interface EvaluationContext {
2346
2625
  readonly graderProvider?: Provider;
2347
2626
  /** @deprecated Use `graderProvider` instead */
2348
2627
  readonly judgeProvider?: Provider;
2349
- readonly evaluatorTemplateOverride?: string;
2350
- readonly evaluator?: EvaluatorConfig;
2628
+ readonly graderTemplateOverride?: string;
2629
+ readonly evaluator?: GraderConfig;
2351
2630
  /** Output messages from agent execution (primary source for tool trajectory) */
2352
2631
  readonly output?: readonly Message[];
2353
2632
  /** Lightweight summary of trace events (if available) */
@@ -2380,8 +2659,8 @@ interface EvaluationScore {
2380
2659
  readonly verdict: EvaluationVerdict;
2381
2660
  readonly assertions: readonly AssertionEntry[];
2382
2661
  readonly expectedAspectCount: number;
2383
- readonly evaluatorRawRequest?: JsonObject;
2384
- readonly scores?: readonly ChildEvaluatorResult[];
2662
+ readonly graderRawRequest?: JsonObject;
2663
+ readonly scores?: readonly ChildGraderResult[];
2385
2664
  /** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
2386
2665
  readonly details?: JsonObject;
2387
2666
  /** Token usage from LLM calls made by this evaluator (optional). */
@@ -2389,26 +2668,26 @@ interface EvaluationScore {
2389
2668
  /** Target name used for grading (e.g., the LLM provider). */
2390
2669
  readonly graderTarget?: string;
2391
2670
  }
2392
- interface ChildEvaluatorResult {
2671
+ interface ChildGraderResult {
2393
2672
  readonly name: string;
2394
2673
  readonly type: string;
2395
2674
  readonly score: number;
2396
2675
  readonly weight?: number;
2397
2676
  readonly verdict: EvaluationVerdict;
2398
2677
  readonly assertions: readonly AssertionEntry[];
2399
- readonly evaluatorRawRequest?: JsonObject;
2400
- readonly scores?: readonly ChildEvaluatorResult[];
2678
+ readonly graderRawRequest?: JsonObject;
2679
+ readonly scores?: readonly ChildGraderResult[];
2401
2680
  /** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
2402
2681
  readonly details?: JsonObject;
2403
2682
  /** Token usage from LLM calls made by this evaluator (optional). */
2404
2683
  readonly tokenUsage?: TokenUsage;
2405
2684
  }
2406
- interface Evaluator {
2685
+ interface Grader {
2407
2686
  readonly kind: string;
2408
2687
  evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
2409
2688
  }
2410
- interface EvaluatorFactory {
2411
- create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
2689
+ interface GraderFactory {
2690
+ create(config: GraderConfig, context: EvaluationContext): Grader;
2412
2691
  }
2413
2692
 
2414
2693
  /**
@@ -2447,7 +2726,7 @@ declare function deepEqual(a: unknown, b: unknown): boolean;
2447
2726
  */
2448
2727
  declare function negateScore(score: EvaluationScore): EvaluationScore;
2449
2728
 
2450
- interface CodeEvaluatorOptions {
2729
+ interface CodeGraderOptions {
2451
2730
  readonly command: readonly string[];
2452
2731
  /** @deprecated Use `command` instead */
2453
2732
  readonly script?: readonly string[];
@@ -2458,29 +2737,29 @@ interface CodeEvaluatorOptions {
2458
2737
  /** Target access config - when present, enables target invocation */
2459
2738
  readonly target?: TargetAccessConfig;
2460
2739
  }
2461
- declare class CodeEvaluator implements Evaluator {
2740
+ declare class CodeGrader implements Grader {
2462
2741
  readonly kind = "code-grader";
2463
2742
  private readonly command;
2464
2743
  private readonly cwd?;
2465
2744
  private readonly agentTimeoutMs?;
2466
2745
  private readonly config?;
2467
2746
  private readonly target?;
2468
- constructor(options: CodeEvaluatorOptions);
2747
+ constructor(options: CodeGraderOptions);
2469
2748
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2470
2749
  }
2471
2750
  declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
2472
2751
 
2473
- interface CompositeEvaluatorOptions {
2474
- readonly config: CompositeEvaluatorConfig;
2475
- readonly evaluatorFactory: EvaluatorFactory;
2752
+ interface CompositeGraderOptions {
2753
+ readonly config: CompositeGraderConfig;
2754
+ readonly evaluatorFactory: GraderFactory;
2476
2755
  readonly cwd?: string;
2477
2756
  }
2478
- declare class CompositeEvaluator implements Evaluator {
2757
+ declare class CompositeGrader implements Grader {
2479
2758
  readonly kind = "composite";
2480
2759
  private readonly config;
2481
2760
  private readonly evaluatorFactory;
2482
2761
  private readonly cwd?;
2483
- constructor(options: CompositeEvaluatorOptions);
2762
+ constructor(options: CompositeGraderOptions);
2484
2763
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2485
2764
  private aggregate;
2486
2765
  private runWeightedAverage;
@@ -2489,50 +2768,50 @@ declare class CompositeEvaluator implements Evaluator {
2489
2768
  private runLlmAggregator;
2490
2769
  }
2491
2770
 
2492
- interface CostEvaluatorOptions {
2493
- readonly config: CostEvaluatorConfig;
2771
+ interface CostGraderOptions {
2772
+ readonly config: CostGraderConfig;
2494
2773
  }
2495
2774
  /**
2496
- * Evaluator that checks execution cost against a budget.
2775
+ * Grader that checks execution cost against a budget.
2497
2776
  * Uses costUsd from the evaluation context.
2498
2777
  */
2499
- declare class CostEvaluator implements Evaluator {
2778
+ declare class CostGrader implements Grader {
2500
2779
  readonly kind = "cost";
2501
2780
  private readonly config;
2502
- constructor(options: CostEvaluatorOptions);
2781
+ constructor(options: CostGraderOptions);
2503
2782
  evaluate(context: EvaluationContext): EvaluationScore;
2504
2783
  }
2505
2784
 
2506
- interface ExecutionMetricsEvaluatorOptions {
2507
- readonly config: ExecutionMetricsEvaluatorConfig;
2785
+ interface ExecutionMetricsGraderOptions {
2786
+ readonly config: ExecutionMetricsGraderConfig;
2508
2787
  }
2509
2788
  /**
2510
- * Evaluator that checks execution metrics against configured thresholds.
2789
+ * Grader that checks execution metrics against configured thresholds.
2511
2790
  * Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
2512
2791
  * and exploration ratio. Only specified thresholds are checked.
2513
2792
  *
2514
2793
  * Score is proportional: passed / total assertions
2515
2794
  */
2516
- declare class ExecutionMetricsEvaluator implements Evaluator {
2795
+ declare class ExecutionMetricsGrader implements Grader {
2517
2796
  readonly kind = "execution-metrics";
2518
2797
  private readonly config;
2519
- constructor(options: ExecutionMetricsEvaluatorOptions);
2798
+ constructor(options: ExecutionMetricsGraderOptions);
2520
2799
  evaluate(context: EvaluationContext): EvaluationScore;
2521
2800
  private extractConfiguredThresholds;
2522
2801
  private filterDefinedMetrics;
2523
2802
  }
2524
2803
 
2525
- interface FieldAccuracyEvaluatorOptions {
2526
- readonly config: FieldAccuracyEvaluatorConfig;
2804
+ interface FieldAccuracyGraderOptions {
2805
+ readonly config: FieldAccuracyGraderConfig;
2527
2806
  }
2528
2807
  /**
2529
- * FieldAccuracyEvaluator compares extracted structured data against expected values
2808
+ * FieldAccuracyGrader compares extracted structured data against expected values
2530
2809
  * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
2531
2810
  */
2532
- declare class FieldAccuracyEvaluator implements Evaluator {
2811
+ declare class FieldAccuracyGrader implements Grader {
2533
2812
  readonly kind = "field-accuracy";
2534
2813
  private readonly config;
2535
- constructor(options: FieldAccuracyEvaluatorOptions);
2814
+ constructor(options: FieldAccuracyGraderOptions);
2536
2815
  evaluate(context: EvaluationContext): EvaluationScore;
2537
2816
  /**
2538
2817
  * Extract expected data from expected_output array.
@@ -2561,33 +2840,33 @@ declare class FieldAccuracyEvaluator implements Evaluator {
2561
2840
  private aggregateResults;
2562
2841
  }
2563
2842
 
2564
- interface LatencyEvaluatorOptions {
2565
- readonly config: LatencyEvaluatorConfig;
2843
+ interface LatencyGraderOptions {
2844
+ readonly config: LatencyGraderConfig;
2566
2845
  }
2567
2846
  /**
2568
- * Evaluator that checks execution duration against a threshold.
2847
+ * Grader that checks execution duration against a threshold.
2569
2848
  * Uses durationMs from the evaluation context.
2570
2849
  */
2571
- declare class LatencyEvaluator implements Evaluator {
2850
+ declare class LatencyGrader implements Grader {
2572
2851
  readonly kind = "latency";
2573
2852
  private readonly config;
2574
- constructor(options: LatencyEvaluatorOptions);
2853
+ constructor(options: LatencyGraderOptions);
2575
2854
  evaluate(context: EvaluationContext): EvaluationScore;
2576
2855
  }
2577
2856
 
2578
2857
  /**
2579
- * Default evaluator template for the user prompt (variables will be substituted).
2580
- * Custom evaluators can override this via evaluatorTemplate option.
2858
+ * Default grader template for the user prompt (variables will be substituted).
2859
+ * Custom graders can override this via graderTemplate option.
2581
2860
  */
2582
- declare const DEFAULT_EVALUATOR_TEMPLATE: string;
2861
+ declare const DEFAULT_GRADER_TEMPLATE: string;
2583
2862
  type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
2584
- interface LlmGraderEvaluatorOptions {
2863
+ interface LlmGraderOptions {
2585
2864
  readonly resolveGraderProvider: GraderProviderResolver;
2586
2865
  /** @deprecated Use `resolveGraderProvider` instead. */
2587
2866
  readonly resolveJudgeProvider?: GraderProviderResolver;
2588
2867
  readonly maxOutputTokens?: number;
2589
2868
  readonly temperature?: number;
2590
- readonly evaluatorTemplate?: string;
2869
+ readonly graderTemplate?: string;
2591
2870
  readonly maxSteps?: number;
2592
2871
  readonly graderTargetProvider?: Provider;
2593
2872
  /** @deprecated Use `graderTargetProvider` instead. */
@@ -2633,39 +2912,39 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2633
2912
  reasoning: z.ZodString;
2634
2913
  }, "strip", z.ZodTypeAny, {
2635
2914
  id: string;
2636
- reasoning: string;
2637
2915
  satisfied: boolean;
2916
+ reasoning: string;
2638
2917
  }, {
2639
2918
  id: string;
2640
- reasoning: string;
2641
2919
  satisfied: boolean;
2920
+ reasoning: string;
2642
2921
  }>, "many">;
2643
2922
  overall_reasoning: z.ZodString;
2644
2923
  }, "strip", z.ZodTypeAny, {
2645
2924
  checks: {
2646
2925
  id: string;
2647
- reasoning: string;
2648
2926
  satisfied: boolean;
2927
+ reasoning: string;
2649
2928
  }[];
2650
2929
  overall_reasoning: string;
2651
2930
  }, {
2652
2931
  checks: {
2653
2932
  id: string;
2654
- reasoning: string;
2655
2933
  satisfied: boolean;
2934
+ reasoning: string;
2656
2935
  }[];
2657
2936
  overall_reasoning: string;
2658
2937
  }>;
2659
2938
 
2660
- declare class LlmGraderEvaluator implements Evaluator {
2939
+ declare class LlmGrader implements Grader {
2661
2940
  readonly kind = "llm-grader";
2662
2941
  private readonly resolveGraderProvider;
2663
2942
  private readonly maxOutputTokens?;
2664
2943
  private readonly temperature?;
2665
- private readonly evaluatorTemplate?;
2944
+ private readonly graderTemplate?;
2666
2945
  private readonly maxSteps;
2667
2946
  private readonly graderTargetProvider?;
2668
- constructor(options: LlmGraderEvaluatorOptions);
2947
+ constructor(options: LlmGraderOptions);
2669
2948
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2670
2949
  private prepareContext;
2671
2950
  private evaluateFreeform;
@@ -2722,7 +3001,7 @@ declare class LlmGraderEvaluator implements Evaluator {
2722
3001
  }
2723
3002
  /**
2724
3003
  * Build the mandatory output schema that all evaluators must follow.
2725
- * This schema is always appended to the evaluator template.
3004
+ * This schema is always appended to the grader template.
2726
3005
  */
2727
3006
  declare function buildOutputSchema(): string;
2728
3007
  declare function buildRubricOutputSchema(): string;
@@ -2766,10 +3045,10 @@ declare function extractImageBlocks(messages: readonly Message[]): ContentImage[
2766
3045
  * names (input.skill, input.file_path) regardless of provider.
2767
3046
  */
2768
3047
 
2769
- declare class SkillTriggerEvaluator implements Evaluator {
3048
+ declare class SkillTriggerGrader implements Grader {
2770
3049
  readonly kind = "skill-trigger";
2771
3050
  private readonly config;
2772
- constructor(config: SkillTriggerEvaluatorConfig);
3051
+ constructor(config: SkillTriggerGraderConfig);
2773
3052
  evaluate(context: EvaluationContext): EvaluationScore;
2774
3053
  }
2775
3054
 
@@ -2783,33 +3062,33 @@ declare function assembleLlmGraderPrompt(input: {
2783
3062
  evalCase: EvalTest;
2784
3063
  candidate: string;
2785
3064
  promptInputs: PromptInputs;
2786
- evaluatorConfig?: LlmGraderEvaluatorConfig;
3065
+ evaluatorConfig?: LlmGraderConfig;
2787
3066
  output?: readonly Message[];
2788
3067
  fileChanges?: string;
2789
- evaluatorTemplateOverride?: string;
3068
+ graderTemplateOverride?: string;
2790
3069
  }): LlmGraderPromptAssembly;
2791
3070
 
2792
- interface TokenUsageEvaluatorOptions {
2793
- readonly config: TokenUsageEvaluatorConfig;
3071
+ interface TokenUsageGraderOptions {
3072
+ readonly config: TokenUsageGraderConfig;
2794
3073
  }
2795
3074
  /**
2796
- * Evaluator that checks provider-reported token usage against configured limits.
3075
+ * Grader that checks provider-reported token usage against configured limits.
2797
3076
  * Uses tokenUsage from the evaluation context.
2798
3077
  */
2799
- declare class TokenUsageEvaluator implements Evaluator {
3078
+ declare class TokenUsageGrader implements Grader {
2800
3079
  readonly kind = "token-usage";
2801
3080
  private readonly config;
2802
- constructor(options: TokenUsageEvaluatorOptions);
3081
+ constructor(options: TokenUsageGraderOptions);
2803
3082
  evaluate(context: EvaluationContext): EvaluationScore;
2804
3083
  }
2805
3084
 
2806
- interface ToolTrajectoryEvaluatorOptions {
2807
- readonly config: ToolTrajectoryEvaluatorConfig;
3085
+ interface ToolTrajectoryGraderOptions {
3086
+ readonly config: ToolTrajectoryGraderConfig;
2808
3087
  }
2809
- declare class ToolTrajectoryEvaluator implements Evaluator {
3088
+ declare class ToolTrajectoryGrader implements Grader {
2810
3089
  readonly kind = "tool-trajectory";
2811
3090
  private readonly config;
2812
- constructor(options: ToolTrajectoryEvaluatorOptions);
3091
+ constructor(options: ToolTrajectoryGraderOptions);
2813
3092
  evaluate(context: EvaluationContext): EvaluationScore;
2814
3093
  /**
2815
3094
  * Extract tool calls from output messages.
@@ -2873,7 +3152,7 @@ declare function runIsJsonAssertion(output: string): AssertionResult;
2873
3152
  declare function runEqualsAssertion(output: string, value: string): AssertionResult;
2874
3153
 
2875
3154
  /**
2876
- * Extensible evaluator registry.
3155
+ * Extensible grader registry.
2877
3156
  *
2878
3157
  * Replaces the hardcoded switch/case dispatch in the orchestrator with
2879
3158
  * a registry of named factory functions. Built-in evaluators are registered
@@ -2882,10 +3161,10 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
2882
3161
  */
2883
3162
 
2884
3163
  /**
2885
- * Context passed to evaluator factory functions during creation.
3164
+ * Context passed to grader factory functions during creation.
2886
3165
  * Contains shared resources needed by evaluator instances.
2887
3166
  */
2888
- interface EvaluatorDispatchContext {
3167
+ interface GraderDispatchContext {
2889
3168
  /** Shared LLM grader provider (resolved at suite level) */
2890
3169
  readonly graderProvider?: Provider;
2891
3170
  /** @deprecated Use `graderProvider` instead */
@@ -2899,48 +3178,48 @@ interface EvaluatorDispatchContext {
2899
3178
  /** Directory containing the eval file (for composite member resolution) */
2900
3179
  readonly evalFileDir?: string;
2901
3180
  /** Shared LLM grader evaluator instance */
2902
- readonly llmGrader: Evaluator;
3181
+ readonly llmGrader: Grader;
2903
3182
  /** @deprecated Use `llmGrader` instead */
2904
- readonly llmJudge?: Evaluator;
3183
+ readonly llmJudge?: Grader;
2905
3184
  /** Reference to the registry itself (for composite evaluators that need to create children) */
2906
- readonly registry: EvaluatorRegistry;
3185
+ readonly registry: GraderRegistry;
2907
3186
  }
2908
3187
  /**
2909
- * Factory function that creates an Evaluator instance from a config.
3188
+ * Factory function that creates a Grader instance from a config.
2910
3189
  *
2911
3190
  * Factory functions handle all type-specific initialization logic:
2912
3191
  * - Reading prompt files for LLM graders
2913
3192
  * - Resolving script paths for code graders
2914
3193
  * - Creating adapter evaluators for deterministic assertions
2915
3194
  */
2916
- type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
3195
+ type GraderFactoryFn = (config: GraderConfig, context: GraderDispatchContext) => Grader | Promise<Grader>;
2917
3196
  /**
2918
- * Registry of evaluator factory functions keyed by evaluator type name.
3197
+ * Registry of grader factory functions keyed by grader type name.
2919
3198
  *
2920
3199
  * Built-in evaluators are registered at startup. Custom evaluators can be
2921
3200
  * registered via the `register()` method or discovered from `.agentv/assertions/`.
2922
3201
  */
2923
- declare class EvaluatorRegistry {
3202
+ declare class GraderRegistry {
2924
3203
  private readonly factories;
2925
- /** Register a factory function for an evaluator type. */
2926
- register(type: string, factory: EvaluatorFactoryFn): this;
2927
- /** Get the factory function for an evaluator type. */
2928
- get(type: string): EvaluatorFactoryFn | undefined;
3204
+ /** Register a factory function for a grader type. */
3205
+ register(type: string, factory: GraderFactoryFn): this;
3206
+ /** Get the factory function for a grader type. */
3207
+ get(type: string): GraderFactoryFn | undefined;
2929
3208
  /** Check if a factory is registered for the given type. */
2930
3209
  has(type: string): boolean;
2931
- /** List all registered evaluator type names. */
3210
+ /** List all registered grader type names. */
2932
3211
  list(): string[];
2933
3212
  /**
2934
3213
  * Create an evaluator instance from a config, using the registered factory.
2935
- * Throws if no factory is registered for the evaluator type.
3214
+ * Throws if no factory is registered for the grader type.
2936
3215
  */
2937
- create(config: EvaluatorConfig, context: EvaluatorDispatchContext): Promise<Evaluator>;
3216
+ create(config: GraderConfig, context: GraderDispatchContext): Promise<Grader>;
2938
3217
  }
2939
3218
  /**
2940
- * Adapter that wraps a synchronous assertion function as an Evaluator.
3219
+ * Adapter that wraps a synchronous assertion function as a Grader.
2941
3220
  * Used for deterministic assertions (contains, regex, is-json, equals).
2942
3221
  */
2943
- declare class DeterministicAssertionEvaluator implements Evaluator {
3222
+ declare class DeterministicAssertionGrader implements Grader {
2944
3223
  private readonly assertFn;
2945
3224
  readonly kind: string;
2946
3225
  constructor(kind: string, assertFn: (context: EvaluationContext) => EvaluationScore);
@@ -2988,8 +3267,8 @@ interface RunEvalCaseOptions {
2988
3267
  readonly evalCase: EvalTest;
2989
3268
  readonly provider: Provider;
2990
3269
  readonly target: ResolvedTarget;
2991
- readonly evaluators: Partial<Record<string, Evaluator>> & {
2992
- readonly 'llm-grader': Evaluator;
3270
+ readonly evaluators: Partial<Record<string, Grader>> & {
3271
+ readonly 'llm-grader': Grader;
2993
3272
  };
2994
3273
  readonly now?: () => Date;
2995
3274
  readonly maxRetries?: number;
@@ -3020,8 +3299,8 @@ interface RunEvalCaseOptions {
3020
3299
  readonly suiteWorkspaceFile?: string;
3021
3300
  /** Real-time observability callbacks passed to the provider */
3022
3301
  readonly streamCallbacks?: ProviderStreamCallbacks;
3023
- /** Evaluator type registry (with custom assertions discovered) */
3024
- readonly typeRegistry?: EvaluatorRegistry;
3302
+ /** Grader type registry (with custom assertions discovered) */
3303
+ readonly typeRegistry?: GraderRegistry;
3025
3304
  /** RepoManager instance for repo lifecycle (shared workspace mode) */
3026
3305
  readonly repoManager?: RepoManager;
3027
3306
  /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
@@ -3054,7 +3333,7 @@ interface RunEvaluationOptions {
3054
3333
  readonly targets?: readonly TargetDefinition[];
3055
3334
  readonly env?: EnvLookup;
3056
3335
  readonly providerFactory?: (target: ResolvedTarget) => Provider;
3057
- readonly evaluators?: Partial<Record<string, Evaluator>>;
3336
+ readonly evaluators?: Partial<Record<string, Grader>>;
3058
3337
  readonly maxRetries?: number;
3059
3338
  readonly agentTimeoutMs?: number;
3060
3339
  readonly cache?: EvaluationCache;
@@ -3076,7 +3355,7 @@ interface RunEvaluationOptions {
3076
3355
  /** Real-time observability callbacks passed to the provider */
3077
3356
  readonly streamCallbacks?: ProviderStreamCallbacks;
3078
3357
  /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
3079
- readonly totalBudgetUsd?: number;
3358
+ readonly budgetUsd?: number;
3080
3359
  /** Execution error tolerance: true halts on first error */
3081
3360
  readonly failOnError?: FailOnError;
3082
3361
  /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
@@ -3107,244 +3386,6 @@ interface RunEvaluationOptions {
3107
3386
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
3108
3387
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
3109
3388
 
3110
- /**
3111
- * Types for inline assertion functions used in the evaluate() API.
3112
- *
3113
- * Inline functions are the escape hatch for custom evaluation logic
3114
- * that doesn't fit a built-in evaluator type. For built-in assertions
3115
- * (contains, regex, is-json, etc.), use config objects instead:
3116
- *
3117
- * assert: [{ type: 'contains', value: 'hello' }]
3118
- *
3119
- * Inline functions are for custom logic:
3120
- *
3121
- * assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
3122
- */
3123
- /** Context passed to inline assertion functions */
3124
- interface AssertContext {
3125
- readonly input: string;
3126
- readonly output: string;
3127
- readonly expectedOutput?: string;
3128
- readonly criteria?: string;
3129
- readonly metadata?: Record<string, unknown>;
3130
- }
3131
- /** Result from an inline assertion function */
3132
- interface AssertResult {
3133
- readonly name: string;
3134
- readonly score: number;
3135
- readonly metadata?: Record<string, unknown>;
3136
- }
3137
- /** Inline assertion function signature */
3138
- type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
3139
-
3140
- /**
3141
- * Programmatic API for running evaluations.
3142
- *
3143
- * Provides `evaluate()` — a high-level function for using AgentV as a library
3144
- * instead of a CLI. The config shape mirrors the YAML structure for easy
3145
- * translation between file-based and programmatic usage.
3146
- *
3147
- * @example Inline tests with config objects
3148
- * ```typescript
3149
- * import { evaluate } from '@agentv/core';
3150
- *
3151
- * const results = await evaluate({
3152
- * tests: [
3153
- * {
3154
- * id: 'capital',
3155
- * input: 'What is the capital of France?',
3156
- * expectedOutput: 'Paris',
3157
- * assert: [{ type: 'contains', value: 'Paris' }],
3158
- * },
3159
- * ],
3160
- * target: { provider: 'mock_agent' },
3161
- * });
3162
- *
3163
- * console.log(results.summary.passed, 'passed');
3164
- * ```
3165
- *
3166
- * @example Inline tests with task function and custom assertion
3167
- * ```typescript
3168
- * import { evaluate } from '@agentv/core';
3169
- *
3170
- * const { summary } = await evaluate({
3171
- * tests: [
3172
- * {
3173
- * id: 'echo',
3174
- * input: 'hello',
3175
- * expectedOutput: 'Echo: hello',
3176
- * assert: [
3177
- * { type: 'contains', value: 'hello' },
3178
- * { type: 'equals' },
3179
- * ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
3180
- * ],
3181
- * },
3182
- * ],
3183
- * task: async (input) => `Echo: ${input}`,
3184
- * });
3185
- * ```
3186
- *
3187
- * @example File-based
3188
- * ```typescript
3189
- * const results = await evaluate({
3190
- * specFile: './evals/EVAL.yaml',
3191
- * target: { provider: 'claude_agent' },
3192
- * });
3193
- * ```
3194
- *
3195
- * @module
3196
- */
3197
-
3198
- /**
3199
- * Inline test definition for the programmatic API.
3200
- * Mirrors the YAML test structure.
3201
- */
3202
- interface EvalTestInput {
3203
- /** Unique test identifier */
3204
- readonly id: string;
3205
- /** What the response should accomplish */
3206
- readonly criteria?: string;
3207
- /** Input to the agent (string or message array) */
3208
- readonly input: string | readonly {
3209
- role: string;
3210
- content: string;
3211
- }[];
3212
- /** Expected reference output (camelCase preferred) */
3213
- readonly expectedOutput?: string;
3214
- /** @deprecated Use `expectedOutput` instead */
3215
- readonly expected_output?: string;
3216
- /** Assertion graders — accepts factory functions, config objects, or inline functions */
3217
- readonly assert?: readonly AssertEntry[];
3218
- /** Arbitrary metadata */
3219
- readonly metadata?: Record<string, unknown>;
3220
- }
3221
- /**
3222
- * Inline assertion definition for the programmatic API.
3223
- * Matches the YAML `assert` block structure.
3224
- */
3225
- interface EvalAssertionInput {
3226
- /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
3227
- readonly type: string;
3228
- /** Display name */
3229
- readonly name?: string;
3230
- /** Value for deterministic assertions (contains, equals, regex) */
3231
- readonly value?: string;
3232
- /** Weight for scoring */
3233
- readonly weight?: number;
3234
- /** Whether this assertion is required to pass */
3235
- readonly required?: boolean | number;
3236
- /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
3237
- readonly min_score?: number;
3238
- /** Prompt file for llm_grader */
3239
- readonly prompt?: string;
3240
- /** Script for code_grader */
3241
- readonly script?: string | readonly string[];
3242
- /** Additional config passed to the assertion */
3243
- readonly config?: Record<string, unknown>;
3244
- /** Nested assertions for composite type */
3245
- readonly assert?: readonly EvalAssertionInput[];
3246
- /** Rubric criteria for rubrics type */
3247
- readonly criteria?: readonly (string | {
3248
- id?: string;
3249
- outcome: string;
3250
- weight?: number;
3251
- })[];
3252
- /** Additional properties */
3253
- readonly [key: string]: unknown;
3254
- }
3255
- /** Assert entry: inline function or config object */
3256
- type AssertEntry = AssertFn | EvalAssertionInput;
3257
- /**
3258
- * Configuration for `evaluate()`.
3259
- * Accepts either inline tests or a spec file path.
3260
- */
3261
- interface EvalConfig {
3262
- /** Inline test definitions (mutually exclusive with specFile) */
3263
- readonly tests?: readonly EvalTestInput[];
3264
- /** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
3265
- readonly specFile?: string;
3266
- /** Target provider configuration */
3267
- readonly target?: TargetDefinition;
3268
- /** Custom task function — mutually exclusive with target */
3269
- readonly task?: (input: string) => string | Promise<string>;
3270
- /** Suite-level assertions applied to all tests */
3271
- readonly assert?: readonly AssertEntry[];
3272
- /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
3273
- readonly filter?: string | readonly string[];
3274
- /** Maximum concurrent workers (default: 3) */
3275
- readonly workers?: number;
3276
- /** Maximum retries on failure (default: 2) */
3277
- readonly maxRetries?: number;
3278
- /** Agent timeout in milliseconds. No timeout if not set. */
3279
- readonly agentTimeoutMs?: number;
3280
- /** Enable response caching */
3281
- readonly cache?: boolean;
3282
- /** Verbose logging */
3283
- readonly verbose?: boolean;
3284
- /** Callback for each completed result */
3285
- readonly onResult?: (result: EvaluationResult) => void;
3286
- /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
3287
- readonly threshold?: number;
3288
- }
3289
- /**
3290
- * Summary statistics for an evaluation run.
3291
- */
3292
- interface EvalSummary {
3293
- /** Total number of test cases */
3294
- readonly total: number;
3295
- /** Number of passing test cases (score >= threshold) */
3296
- readonly passed: number;
3297
- /** Number of failing test cases (score < threshold) */
3298
- readonly failed: number;
3299
- /** Total duration in milliseconds */
3300
- readonly durationMs: number;
3301
- /** Mean score across all cases */
3302
- readonly meanScore: number;
3303
- }
3304
- /**
3305
- * Result of an `evaluate()` call.
3306
- */
3307
- interface EvalRunResult {
3308
- /** Individual test case results */
3309
- readonly results: readonly EvaluationResult[];
3310
- /** Aggregate summary statistics */
3311
- readonly summary: EvalSummary;
3312
- }
3313
- /**
3314
- * Run an evaluation suite against a target provider.
3315
- *
3316
- * Accepts either inline test definitions or a path to an EVAL.yaml spec file.
3317
- * The config shape mirrors the YAML structure — users can translate between
3318
- * file-based and programmatic usage 1:1.
3319
- *
3320
- * @param config - Evaluation configuration
3321
- * @returns Typed evaluation results with summary statistics
3322
- *
3323
- * @example Inline tests with assertions
3324
- * ```typescript
3325
- * const { results, summary } = await evaluate({
3326
- * tests: [
3327
- * {
3328
- * id: 'greeting',
3329
- * input: 'Say hello',
3330
- * assert: [{ type: 'contains', value: 'hello' }],
3331
- * },
3332
- * ],
3333
- * target: { provider: 'mock_agent' },
3334
- * });
3335
- * console.log(`${summary.passed}/${summary.total} passed`);
3336
- * ```
3337
- *
3338
- * @example Load from YAML
3339
- * ```typescript
3340
- * const { summary } = await evaluate({
3341
- * specFile: './evals/my-eval.yaml',
3342
- * filter: 'greeting-*',
3343
- * });
3344
- * ```
3345
- */
3346
- declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
3347
-
3348
3389
  /**
3349
3390
  * Typed configuration file support for AgentV.
3350
3391
  *
@@ -4186,17 +4227,17 @@ declare class OtlpJsonFileExporter {
4186
4227
  }
4187
4228
 
4188
4229
  /**
4189
- * Factory functions for all built-in evaluator types.
4230
+ * Factory functions for all built-in grader types.
4190
4231
  *
4191
- * Each factory creates an Evaluator instance from an EvaluatorConfig,
4232
+ * Each factory creates a Grader instance from a GraderConfig,
4192
4233
  * handling type-specific initialization logic. These are registered into
4193
- * the EvaluatorRegistry at startup.
4234
+ * the GraderRegistry at startup.
4194
4235
  */
4195
4236
 
4196
4237
  /**
4197
- * Create a new EvaluatorRegistry with all built-in evaluator types registered.
4238
+ * Create a new GraderRegistry with all built-in grader types registered.
4198
4239
  */
4199
- declare function createBuiltinRegistry(): EvaluatorRegistry;
4240
+ declare function createBuiltinRegistry(): GraderRegistry;
4200
4241
 
4201
4242
  /**
4202
4243
  * Convention-based discovery of custom assertion scripts.
@@ -4216,27 +4257,27 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
4216
4257
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
4217
4258
  * @returns Names of discovered assertion types
4218
4259
  */
4219
- declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
4260
+ declare function discoverAssertions(registry: GraderRegistry, baseDir: string): Promise<string[]>;
4220
4261
 
4221
4262
  /**
4222
4263
  * Convention-based discovery of custom grader scripts.
4223
4264
  *
4224
4265
  * Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
4225
- * files and registers them as code-grader evaluators in the registry. The file name
4226
- * (without extension) becomes the evaluator type name.
4266
+ * files and registers them as code graders in the registry. The file name
4267
+ * (without extension) becomes the grader type name.
4227
4268
  *
4228
4269
  * Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
4229
4270
  */
4230
4271
 
4231
4272
  /**
4232
4273
  * Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
4233
- * and register them as evaluator types in the registry.
4274
+ * and register them as grader types in the registry.
4234
4275
  *
4235
- * @param registry - The evaluator registry to register discovered graders into
4276
+ * @param registry - The grader registry to register discovered graders into
4236
4277
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
4237
4278
  * @returns Names of discovered grader types
4238
4279
  */
4239
- declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
4280
+ declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
4240
4281
 
4241
4282
  /**
4242
4283
  * Core types for the transcript import pipeline.
@@ -4489,7 +4530,7 @@ declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<C
4489
4530
  * 1. Reads a transcript JSONL file (produced by `agentv import`)
4490
4531
  * 2. Each invocation pops the next line from the transcript
4491
4532
  * 3. Returns a ProviderResponse with pre-populated output, token usage, etc.
4492
- * 4. Evaluators run identically to live eval — they see the same ProviderResponse
4533
+ * 4. Graders run identically to live eval — they see the same ProviderResponse
4493
4534
  *
4494
4535
  * The provider name in results is set to the source provider from the transcript
4495
4536
  * (e.g., "claude", "codex", "copilot").
@@ -4555,4 +4596,4 @@ type AgentKernel = {
4555
4596
  };
4556
4597
  declare function createAgentKernel(): AgentKernel;
4557
4598
 
4558
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type 
PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, 
calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, 
prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4599
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, 
createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, 
rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };