@agentv/core 4.6.1 → 4.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-ZK4GG7PR.js → chunk-VCVVKCC4.js} +268 -128
- package/dist/chunk-VCVVKCC4.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +110 -94
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +30 -71
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1353 -466
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +301 -74
- package/dist/index.d.ts +301 -74
- package/dist/index.js +1053 -311
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-ZK4GG7PR.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -73,7 +73,7 @@ interface ChatMessage {
|
|
|
73
73
|
readonly name?: string;
|
|
74
74
|
}
|
|
75
75
|
type ChatPrompt = readonly ChatMessage[];
|
|
76
|
-
type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
|
|
76
|
+
type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv' | 'transcript';
|
|
77
77
|
/** Callbacks for real-time observability during provider execution */
|
|
78
78
|
interface ProviderStreamCallbacks {
|
|
79
79
|
onToolCallStart?: (toolName: string, toolCallId?: string) => void;
|
|
@@ -222,25 +222,19 @@ interface TargetDefinition {
|
|
|
222
222
|
readonly judge_target?: string | undefined;
|
|
223
223
|
readonly workers?: number | undefined;
|
|
224
224
|
readonly provider_batching?: boolean | undefined;
|
|
225
|
-
readonly
|
|
225
|
+
readonly subagent_mode_allowed?: boolean | undefined;
|
|
226
226
|
readonly endpoint?: string | unknown | undefined;
|
|
227
227
|
readonly base_url?: string | unknown | undefined;
|
|
228
|
-
readonly baseUrl?: string | unknown | undefined;
|
|
229
228
|
readonly resource?: string | unknown | undefined;
|
|
230
|
-
readonly resourceName?: string | unknown | undefined;
|
|
231
229
|
readonly api_key?: string | unknown | undefined;
|
|
232
|
-
readonly apiKey?: string | unknown | undefined;
|
|
233
230
|
readonly deployment?: string | unknown | undefined;
|
|
234
|
-
readonly deploymentName?: string | unknown | undefined;
|
|
235
231
|
readonly model?: string | unknown | undefined;
|
|
236
232
|
readonly version?: string | unknown | undefined;
|
|
237
233
|
readonly api_version?: string | unknown | undefined;
|
|
238
234
|
readonly variant?: string | unknown | undefined;
|
|
239
235
|
readonly thinking_budget?: number | unknown | undefined;
|
|
240
|
-
readonly thinkingBudget?: number | unknown | undefined;
|
|
241
236
|
readonly temperature?: number | unknown | undefined;
|
|
242
237
|
readonly max_output_tokens?: number | unknown | undefined;
|
|
243
|
-
readonly maxTokens?: number | unknown | undefined;
|
|
244
238
|
readonly executable?: string | unknown | undefined;
|
|
245
239
|
readonly command?: string | unknown | undefined;
|
|
246
240
|
readonly binary?: string | unknown | undefined;
|
|
@@ -248,63 +242,36 @@ interface TargetDefinition {
|
|
|
248
242
|
readonly arguments?: unknown | undefined;
|
|
249
243
|
readonly cwd?: string | unknown | undefined;
|
|
250
244
|
readonly timeout_seconds?: number | unknown | undefined;
|
|
251
|
-
readonly timeoutSeconds?: number | unknown | undefined;
|
|
252
245
|
readonly log_dir?: string | unknown | undefined;
|
|
253
|
-
readonly logDir?: string | unknown | undefined;
|
|
254
246
|
readonly log_directory?: string | unknown | undefined;
|
|
255
|
-
readonly logDirectory?: string | unknown | undefined;
|
|
256
247
|
readonly log_format?: string | unknown | undefined;
|
|
257
|
-
readonly logFormat?: string | unknown | undefined;
|
|
258
248
|
readonly log_output_format?: string | unknown | undefined;
|
|
259
|
-
readonly logOutputFormat?: string | unknown | undefined;
|
|
260
249
|
readonly system_prompt?: string | unknown | undefined;
|
|
261
|
-
readonly systemPrompt?: string | unknown | undefined;
|
|
262
250
|
readonly max_turns?: number | unknown | undefined;
|
|
263
|
-
readonly maxTurns?: number | unknown | undefined;
|
|
264
251
|
readonly max_budget_usd?: number | unknown | undefined;
|
|
265
|
-
readonly maxBudgetUsd?: number | unknown | undefined;
|
|
266
252
|
readonly response?: string | unknown | undefined;
|
|
267
|
-
readonly delayMs?: number | unknown | undefined;
|
|
268
|
-
readonly delayMinMs?: number | unknown | undefined;
|
|
269
|
-
readonly delayMaxMs?: number | unknown | undefined;
|
|
270
253
|
readonly wait?: boolean | unknown | undefined;
|
|
271
254
|
readonly dry_run?: boolean | unknown | undefined;
|
|
272
|
-
readonly dryRun?: boolean | unknown | undefined;
|
|
273
255
|
readonly subagent_root?: string | unknown | undefined;
|
|
274
|
-
readonly subagentRoot?: string | unknown | undefined;
|
|
275
256
|
readonly workspace_template?: string | unknown | undefined;
|
|
276
|
-
readonly workspaceTemplate?: string | unknown | undefined;
|
|
277
257
|
readonly files_format?: string | unknown | undefined;
|
|
278
|
-
readonly filesFormat?: string | unknown | undefined;
|
|
279
258
|
readonly attachments_format?: string | unknown | undefined;
|
|
280
|
-
readonly attachmentsFormat?: string | unknown | undefined;
|
|
281
259
|
readonly env?: unknown | undefined;
|
|
282
260
|
readonly healthcheck?: unknown | undefined;
|
|
283
261
|
readonly session_dir?: string | unknown | undefined;
|
|
284
|
-
readonly sessionDir?: string | unknown | undefined;
|
|
285
262
|
readonly session_id?: string | unknown | undefined;
|
|
286
|
-
readonly sessionId?: string | unknown | undefined;
|
|
287
263
|
readonly discover?: string | unknown | undefined;
|
|
288
264
|
readonly session_state_dir?: string | unknown | undefined;
|
|
289
|
-
readonly sessionStateDir?: string | unknown | undefined;
|
|
290
265
|
readonly cli_url?: string | unknown | undefined;
|
|
291
|
-
readonly cliUrl?: string | unknown | undefined;
|
|
292
266
|
readonly cli_path?: string | unknown | undefined;
|
|
293
|
-
readonly cliPath?: string | unknown | undefined;
|
|
294
267
|
readonly github_token?: string | unknown | undefined;
|
|
295
|
-
readonly
|
|
268
|
+
readonly byok?: Record<string, unknown> | undefined;
|
|
296
269
|
readonly max_retries?: number | unknown | undefined;
|
|
297
|
-
readonly maxRetries?: number | unknown | undefined;
|
|
298
270
|
readonly retry_initial_delay_ms?: number | unknown | undefined;
|
|
299
|
-
readonly retryInitialDelayMs?: number | unknown | undefined;
|
|
300
271
|
readonly retry_max_delay_ms?: number | unknown | undefined;
|
|
301
|
-
readonly retryMaxDelayMs?: number | unknown | undefined;
|
|
302
272
|
readonly retry_backoff_factor?: number | unknown | undefined;
|
|
303
|
-
readonly retryBackoffFactor?: number | unknown | undefined;
|
|
304
273
|
readonly retry_status_codes?: unknown | undefined;
|
|
305
|
-
readonly retryStatusCodes?: unknown | undefined;
|
|
306
274
|
readonly fallback_targets?: readonly string[] | unknown | undefined;
|
|
307
|
-
readonly fallbackTargets?: readonly string[] | unknown | undefined;
|
|
308
275
|
}
|
|
309
276
|
|
|
310
277
|
/**
|
|
@@ -375,6 +342,8 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
375
342
|
/** Optional weight for top-level aggregation (defaults to 1.0) */
|
|
376
343
|
readonly weight?: number;
|
|
377
344
|
readonly required?: boolean | number;
|
|
345
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
346
|
+
readonly min_score?: number;
|
|
378
347
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
379
348
|
readonly negate?: boolean;
|
|
380
349
|
/** Default argument matching mode for all expected items (defaults to 'exact') */
|
|
@@ -667,6 +636,8 @@ type CodeEvaluatorConfig = {
|
|
|
667
636
|
readonly resolvedCwd?: string;
|
|
668
637
|
readonly weight?: number;
|
|
669
638
|
readonly required?: boolean | number;
|
|
639
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
640
|
+
readonly min_score?: number;
|
|
670
641
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
671
642
|
readonly negate?: boolean;
|
|
672
643
|
/** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
|
|
@@ -699,6 +670,8 @@ type LlmGraderEvaluatorConfig = {
|
|
|
699
670
|
readonly rubrics?: readonly RubricItem[];
|
|
700
671
|
readonly weight?: number;
|
|
701
672
|
readonly required?: boolean | number;
|
|
673
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
674
|
+
readonly min_score?: number;
|
|
702
675
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
703
676
|
readonly negate?: boolean;
|
|
704
677
|
/** Optional target override for this grader (uses a named LLM target from targets.yaml). */
|
|
@@ -737,13 +710,17 @@ type RubricItem = {
|
|
|
737
710
|
readonly outcome?: string;
|
|
738
711
|
readonly weight: number;
|
|
739
712
|
/**
|
|
740
|
-
* Legacy boolean gating (
|
|
741
|
-
* Use required_min_score instead for finer control.
|
|
713
|
+
* Legacy boolean gating (treated as min_score: 1.0 for score-range rubrics).
|
|
742
714
|
*/
|
|
743
715
|
readonly required?: boolean;
|
|
744
716
|
/**
|
|
745
|
-
* Minimum score (0-
|
|
746
|
-
*
|
|
717
|
+
* Minimum score (0-1 scale) required to pass this criterion.
|
|
718
|
+
* Internally compared against normalized score (rawScore / 10).
|
|
719
|
+
*/
|
|
720
|
+
readonly min_score?: number;
|
|
721
|
+
/**
|
|
722
|
+
* @deprecated Use min_score (0-1 scale) instead.
|
|
723
|
+
* Legacy: minimum score on 0-10 integer scale.
|
|
747
724
|
*/
|
|
748
725
|
readonly required_min_score?: number;
|
|
749
726
|
/**
|
|
@@ -776,6 +753,8 @@ type CompositeEvaluatorConfig = {
|
|
|
776
753
|
readonly aggregator: CompositeAggregatorConfig;
|
|
777
754
|
readonly weight?: number;
|
|
778
755
|
readonly required?: boolean | number;
|
|
756
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
757
|
+
readonly min_score?: number;
|
|
779
758
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
780
759
|
readonly negate?: boolean;
|
|
781
760
|
};
|
|
@@ -820,6 +799,8 @@ type FieldAccuracyEvaluatorConfig = {
|
|
|
820
799
|
readonly aggregation?: FieldAggregationType;
|
|
821
800
|
readonly weight?: number;
|
|
822
801
|
readonly required?: boolean | number;
|
|
802
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
803
|
+
readonly min_score?: number;
|
|
823
804
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
824
805
|
readonly negate?: boolean;
|
|
825
806
|
};
|
|
@@ -834,6 +815,8 @@ type LatencyEvaluatorConfig = {
|
|
|
834
815
|
readonly threshold: number;
|
|
835
816
|
readonly weight?: number;
|
|
836
817
|
readonly required?: boolean | number;
|
|
818
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
819
|
+
readonly min_score?: number;
|
|
837
820
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
838
821
|
readonly negate?: boolean;
|
|
839
822
|
};
|
|
@@ -848,6 +831,8 @@ type CostEvaluatorConfig = {
|
|
|
848
831
|
readonly budget: number;
|
|
849
832
|
readonly weight?: number;
|
|
850
833
|
readonly required?: boolean | number;
|
|
834
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
835
|
+
readonly min_score?: number;
|
|
851
836
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
852
837
|
readonly negate?: boolean;
|
|
853
838
|
};
|
|
@@ -866,6 +851,8 @@ type TokenUsageEvaluatorConfig = {
|
|
|
866
851
|
readonly max_output?: number;
|
|
867
852
|
readonly weight?: number;
|
|
868
853
|
readonly required?: boolean | number;
|
|
854
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
855
|
+
readonly min_score?: number;
|
|
869
856
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
870
857
|
readonly negate?: boolean;
|
|
871
858
|
};
|
|
@@ -893,6 +880,8 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
893
880
|
readonly exploration_tolerance?: number;
|
|
894
881
|
readonly weight?: number;
|
|
895
882
|
readonly required?: boolean | number;
|
|
883
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
884
|
+
readonly min_score?: number;
|
|
896
885
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
897
886
|
readonly negate?: boolean;
|
|
898
887
|
};
|
|
@@ -906,6 +895,8 @@ type ContainsEvaluatorConfig = {
|
|
|
906
895
|
readonly value: string;
|
|
907
896
|
readonly weight?: number;
|
|
908
897
|
readonly required?: boolean | number;
|
|
898
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
899
|
+
readonly min_score?: number;
|
|
909
900
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
910
901
|
readonly negate?: boolean;
|
|
911
902
|
};
|
|
@@ -919,6 +910,8 @@ type ContainsAnyEvaluatorConfig = {
|
|
|
919
910
|
readonly value: readonly string[];
|
|
920
911
|
readonly weight?: number;
|
|
921
912
|
readonly required?: boolean | number;
|
|
913
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
914
|
+
readonly min_score?: number;
|
|
922
915
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
923
916
|
readonly negate?: boolean;
|
|
924
917
|
};
|
|
@@ -932,6 +925,8 @@ type ContainsAllEvaluatorConfig = {
|
|
|
932
925
|
readonly value: readonly string[];
|
|
933
926
|
readonly weight?: number;
|
|
934
927
|
readonly required?: boolean | number;
|
|
928
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
929
|
+
readonly min_score?: number;
|
|
935
930
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
936
931
|
readonly negate?: boolean;
|
|
937
932
|
};
|
|
@@ -945,6 +940,8 @@ type IcontainsEvaluatorConfig = {
|
|
|
945
940
|
readonly value: string;
|
|
946
941
|
readonly weight?: number;
|
|
947
942
|
readonly required?: boolean | number;
|
|
943
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
944
|
+
readonly min_score?: number;
|
|
948
945
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
949
946
|
readonly negate?: boolean;
|
|
950
947
|
};
|
|
@@ -958,6 +955,8 @@ type IcontainsAnyEvaluatorConfig = {
|
|
|
958
955
|
readonly value: readonly string[];
|
|
959
956
|
readonly weight?: number;
|
|
960
957
|
readonly required?: boolean | number;
|
|
958
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
959
|
+
readonly min_score?: number;
|
|
961
960
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
962
961
|
readonly negate?: boolean;
|
|
963
962
|
};
|
|
@@ -971,6 +970,8 @@ type IcontainsAllEvaluatorConfig = {
|
|
|
971
970
|
readonly value: readonly string[];
|
|
972
971
|
readonly weight?: number;
|
|
973
972
|
readonly required?: boolean | number;
|
|
973
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
974
|
+
readonly min_score?: number;
|
|
974
975
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
975
976
|
readonly negate?: boolean;
|
|
976
977
|
};
|
|
@@ -984,6 +985,8 @@ type StartsWithEvaluatorConfig = {
|
|
|
984
985
|
readonly value: string;
|
|
985
986
|
readonly weight?: number;
|
|
986
987
|
readonly required?: boolean | number;
|
|
988
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
989
|
+
readonly min_score?: number;
|
|
987
990
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
988
991
|
readonly negate?: boolean;
|
|
989
992
|
};
|
|
@@ -997,6 +1000,8 @@ type EndsWithEvaluatorConfig = {
|
|
|
997
1000
|
readonly value: string;
|
|
998
1001
|
readonly weight?: number;
|
|
999
1002
|
readonly required?: boolean | number;
|
|
1003
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1004
|
+
readonly min_score?: number;
|
|
1000
1005
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1001
1006
|
readonly negate?: boolean;
|
|
1002
1007
|
};
|
|
@@ -1012,6 +1017,8 @@ type RegexEvaluatorConfig = {
|
|
|
1012
1017
|
readonly flags?: string;
|
|
1013
1018
|
readonly weight?: number;
|
|
1014
1019
|
readonly required?: boolean | number;
|
|
1020
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1021
|
+
readonly min_score?: number;
|
|
1015
1022
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1016
1023
|
readonly negate?: boolean;
|
|
1017
1024
|
};
|
|
@@ -1024,6 +1031,8 @@ type IsJsonEvaluatorConfig = {
|
|
|
1024
1031
|
readonly type: 'is-json';
|
|
1025
1032
|
readonly weight?: number;
|
|
1026
1033
|
readonly required?: boolean | number;
|
|
1034
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1035
|
+
readonly min_score?: number;
|
|
1027
1036
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1028
1037
|
readonly negate?: boolean;
|
|
1029
1038
|
};
|
|
@@ -1037,6 +1046,8 @@ type EqualsEvaluatorConfig = {
|
|
|
1037
1046
|
readonly value: string;
|
|
1038
1047
|
readonly weight?: number;
|
|
1039
1048
|
readonly required?: boolean | number;
|
|
1049
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1050
|
+
readonly min_score?: number;
|
|
1040
1051
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1041
1052
|
readonly negate?: boolean;
|
|
1042
1053
|
};
|
|
@@ -1050,6 +1061,8 @@ type RubricsEvaluatorConfig = {
|
|
|
1050
1061
|
readonly criteria: readonly RubricItem[];
|
|
1051
1062
|
readonly weight?: number;
|
|
1052
1063
|
readonly required?: boolean | number;
|
|
1064
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1065
|
+
readonly min_score?: number;
|
|
1053
1066
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1054
1067
|
readonly negate?: boolean;
|
|
1055
1068
|
};
|
|
@@ -1068,6 +1081,8 @@ type SkillTriggerEvaluatorConfig = {
|
|
|
1068
1081
|
readonly should_trigger?: boolean;
|
|
1069
1082
|
readonly weight?: number;
|
|
1070
1083
|
readonly required?: boolean | number;
|
|
1084
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1085
|
+
readonly min_score?: number;
|
|
1071
1086
|
readonly negate?: boolean;
|
|
1072
1087
|
};
|
|
1073
1088
|
/**
|
|
@@ -1079,6 +1094,8 @@ type InlineAssertEvaluatorConfig = {
|
|
|
1079
1094
|
readonly type: 'inline-assert';
|
|
1080
1095
|
readonly weight?: number;
|
|
1081
1096
|
readonly required?: boolean | number;
|
|
1097
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1098
|
+
readonly min_score?: number;
|
|
1082
1099
|
readonly negate?: boolean;
|
|
1083
1100
|
};
|
|
1084
1101
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
|
|
@@ -1087,7 +1104,7 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
|
|
|
1087
1104
|
*/
|
|
1088
1105
|
interface EvalTest {
|
|
1089
1106
|
readonly id: string;
|
|
1090
|
-
readonly
|
|
1107
|
+
readonly suite?: string;
|
|
1091
1108
|
readonly category?: string;
|
|
1092
1109
|
readonly conversation_id?: string;
|
|
1093
1110
|
readonly question: string;
|
|
@@ -1104,6 +1121,8 @@ interface EvalTest {
|
|
|
1104
1121
|
readonly metadata?: Record<string, unknown>;
|
|
1105
1122
|
/** Per-test target override (matrix evaluation) */
|
|
1106
1123
|
readonly targets?: readonly string[];
|
|
1124
|
+
/** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
|
|
1125
|
+
readonly threshold?: number;
|
|
1107
1126
|
}
|
|
1108
1127
|
/** @deprecated Use `EvalTest` instead */
|
|
1109
1128
|
type EvalCase = EvalTest;
|
|
@@ -1197,7 +1216,7 @@ type FailOnError = boolean;
|
|
|
1197
1216
|
interface EvaluationResult {
|
|
1198
1217
|
readonly timestamp: string;
|
|
1199
1218
|
readonly testId: string;
|
|
1200
|
-
readonly
|
|
1219
|
+
readonly suite?: string;
|
|
1201
1220
|
readonly category?: string;
|
|
1202
1221
|
readonly conversationId?: string;
|
|
1203
1222
|
readonly score: number;
|
|
@@ -1427,8 +1446,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skill
|
|
|
1427
1446
|
|
|
1428
1447
|
type LoadOptions = {
|
|
1429
1448
|
readonly verbose?: boolean;
|
|
1430
|
-
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
|
|
1431
|
-
readonly filter?: string;
|
|
1449
|
+
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
|
|
1450
|
+
readonly filter?: string | readonly string[];
|
|
1432
1451
|
/** Category derived from the eval file's directory path */
|
|
1433
1452
|
readonly category?: string;
|
|
1434
1453
|
};
|
|
@@ -1599,7 +1618,7 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
1599
1618
|
/**
|
|
1600
1619
|
* Strict normalized schema for CLI target configuration.
|
|
1601
1620
|
* This is the final validated shape after environment variable resolution
|
|
1602
|
-
* and
|
|
1621
|
+
* and internal field normalization.
|
|
1603
1622
|
*
|
|
1604
1623
|
* Uses .strict() to reject unknown properties, ensuring configuration
|
|
1605
1624
|
* errors are caught early rather than silently ignored.
|
|
@@ -1648,8 +1667,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1648
1667
|
command: string;
|
|
1649
1668
|
verbose?: boolean | undefined;
|
|
1650
1669
|
cwd?: string | undefined;
|
|
1651
|
-
filesFormat?: string | undefined;
|
|
1652
|
-
workspaceTemplate?: string | undefined;
|
|
1653
1670
|
healthcheck?: {
|
|
1654
1671
|
url: string;
|
|
1655
1672
|
timeoutMs?: number | undefined;
|
|
@@ -1658,14 +1675,14 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1658
1675
|
cwd?: string | undefined;
|
|
1659
1676
|
timeoutMs?: number | undefined;
|
|
1660
1677
|
} | undefined;
|
|
1661
|
-
keepTempFiles?: boolean | undefined;
|
|
1662
1678
|
timeoutMs?: number | undefined;
|
|
1679
|
+
filesFormat?: string | undefined;
|
|
1680
|
+
workspaceTemplate?: string | undefined;
|
|
1681
|
+
keepTempFiles?: boolean | undefined;
|
|
1663
1682
|
}, {
|
|
1664
1683
|
command: string;
|
|
1665
1684
|
verbose?: boolean | undefined;
|
|
1666
1685
|
cwd?: string | undefined;
|
|
1667
|
-
filesFormat?: string | undefined;
|
|
1668
|
-
workspaceTemplate?: string | undefined;
|
|
1669
1686
|
healthcheck?: {
|
|
1670
1687
|
url: string;
|
|
1671
1688
|
timeoutMs?: number | undefined;
|
|
@@ -1674,8 +1691,10 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1674
1691
|
cwd?: string | undefined;
|
|
1675
1692
|
timeoutMs?: number | undefined;
|
|
1676
1693
|
} | undefined;
|
|
1677
|
-
keepTempFiles?: boolean | undefined;
|
|
1678
1694
|
timeoutMs?: number | undefined;
|
|
1695
|
+
filesFormat?: string | undefined;
|
|
1696
|
+
workspaceTemplate?: string | undefined;
|
|
1697
|
+
keepTempFiles?: boolean | undefined;
|
|
1679
1698
|
}>;
|
|
1680
1699
|
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
1681
1700
|
/**
|
|
@@ -1707,6 +1726,7 @@ interface AzureResolvedConfig {
|
|
|
1707
1726
|
readonly deploymentName: string;
|
|
1708
1727
|
readonly apiKey: string;
|
|
1709
1728
|
readonly version?: string;
|
|
1729
|
+
readonly apiFormat?: ApiFormat;
|
|
1710
1730
|
readonly temperature?: number;
|
|
1711
1731
|
readonly maxOutputTokens?: number;
|
|
1712
1732
|
readonly retry?: RetryConfig;
|
|
@@ -1787,6 +1807,18 @@ interface CopilotSdkResolvedConfig {
|
|
|
1787
1807
|
readonly logDir?: string;
|
|
1788
1808
|
readonly logFormat?: 'summary' | 'json';
|
|
1789
1809
|
readonly systemPrompt?: string;
|
|
1810
|
+
/** BYOK provider type: "azure", "openai", or "anthropic". */
|
|
1811
|
+
readonly byokType?: string;
|
|
1812
|
+
/** BYOK base URL for the provider endpoint. */
|
|
1813
|
+
readonly byokBaseUrl?: string;
|
|
1814
|
+
/** BYOK API key for authenticating with the provider. */
|
|
1815
|
+
readonly byokApiKey?: string;
|
|
1816
|
+
/** BYOK bearer token (takes precedence over apiKey when set). */
|
|
1817
|
+
readonly byokBearerToken?: string;
|
|
1818
|
+
/** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
|
|
1819
|
+
readonly byokApiVersion?: string;
|
|
1820
|
+
/** BYOK wire API format: "completions" or "responses". */
|
|
1821
|
+
readonly byokWireApi?: string;
|
|
1790
1822
|
}
|
|
1791
1823
|
interface CopilotLogResolvedConfig {
|
|
1792
1824
|
/** Explicit path to a session directory containing events.jsonl. */
|
|
@@ -1931,15 +1963,20 @@ type ResolvedTarget = (ResolvedTargetBase & {
|
|
|
1931
1963
|
}) | (ResolvedTargetBase & {
|
|
1932
1964
|
readonly kind: 'cli';
|
|
1933
1965
|
readonly config: CliResolvedConfig;
|
|
1966
|
+
}) | (ResolvedTargetBase & {
|
|
1967
|
+
readonly kind: 'transcript';
|
|
1968
|
+
readonly config: Record<string, never>;
|
|
1934
1969
|
});
|
|
1935
1970
|
/**
|
|
1936
1971
|
* Optional settings accepted on ALL target definitions regardless of provider.
|
|
1937
1972
|
* Exported so the targets validator can reuse the same list — adding a field
|
|
1938
1973
|
* here automatically makes it valid in targets.yaml without a separate update.
|
|
1939
1974
|
*/
|
|
1940
|
-
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "
|
|
1975
|
+
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
|
|
1941
1976
|
declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
|
|
1942
|
-
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string
|
|
1977
|
+
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
|
|
1978
|
+
readonly emitDeprecationWarnings?: boolean;
|
|
1979
|
+
}): ResolvedTarget;
|
|
1943
1980
|
|
|
1944
1981
|
/**
|
|
1945
1982
|
* Extensible provider registry.
|
|
@@ -2204,19 +2241,25 @@ interface EvaluatorFactory {
|
|
|
2204
2241
|
*
|
|
2205
2242
|
* Scoring model:
|
|
2206
2243
|
* score ∈ [0, 1] — continuous quality signal
|
|
2207
|
-
* verdict — binary classification derived from score via
|
|
2244
|
+
* verdict — binary classification derived from score via threshold
|
|
2208
2245
|
*
|
|
2209
|
-
* score >=
|
|
2210
|
-
* score <
|
|
2246
|
+
* score >= threshold → 'pass'
|
|
2247
|
+
* score < threshold → 'fail'
|
|
2211
2248
|
* (infrastructure skip) → 'skip'
|
|
2212
2249
|
*
|
|
2213
|
-
*
|
|
2214
|
-
*
|
|
2250
|
+
* Scoring scale principle:
|
|
2251
|
+
* All user-configurable score thresholds use 0-1 scale.
|
|
2252
|
+
* The only 0-10 values in YAML are `score_ranges` which define LLM integer output band labels.
|
|
2253
|
+
*
|
|
2254
|
+
* Default threshold is 0.8. Override via CLI `--threshold`, suite `execution.threshold`,
|
|
2255
|
+
* or per-test `execution.threshold`. All verdict derivation flows through scoreToVerdict().
|
|
2215
2256
|
*/
|
|
2216
2257
|
|
|
2217
|
-
/**
|
|
2258
|
+
/** Default score threshold for pass verdict (0-1). Scores below this are fail. */
|
|
2259
|
+
declare const DEFAULT_THRESHOLD = 0.8;
|
|
2260
|
+
/** @deprecated Use DEFAULT_THRESHOLD instead. */
|
|
2218
2261
|
declare const PASS_THRESHOLD = 0.8;
|
|
2219
|
-
declare function scoreToVerdict(score: number): EvaluationVerdict;
|
|
2262
|
+
declare function scoreToVerdict(score: number, threshold?: number): EvaluationVerdict;
|
|
2220
2263
|
declare function clampScore(value: number): number;
|
|
2221
2264
|
declare function extractJsonBlob(text: string): string | undefined;
|
|
2222
2265
|
declare function parseJsonFromText(text: string): unknown;
|
|
@@ -2499,6 +2542,7 @@ declare class LlmGraderEvaluator implements Evaluator {
|
|
|
2499
2542
|
private buildScoreRangePrompt;
|
|
2500
2543
|
private buildRubricPrompt;
|
|
2501
2544
|
private runWithRetry;
|
|
2545
|
+
private generateStructuredResponse;
|
|
2502
2546
|
}
|
|
2503
2547
|
/**
|
|
2504
2548
|
* Build the mandatory output schema that all evaluators must follow.
|
|
@@ -2837,8 +2881,8 @@ interface RunEvaluationOptions {
|
|
|
2837
2881
|
readonly cache?: EvaluationCache;
|
|
2838
2882
|
readonly useCache?: boolean;
|
|
2839
2883
|
readonly now?: () => Date;
|
|
2840
|
-
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
|
|
2841
|
-
readonly filter?: string;
|
|
2884
|
+
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
|
|
2885
|
+
readonly filter?: string | readonly string[];
|
|
2842
2886
|
readonly verbose?: boolean;
|
|
2843
2887
|
readonly maxConcurrency?: number;
|
|
2844
2888
|
readonly evalCases?: readonly EvalTest[];
|
|
@@ -3008,6 +3052,8 @@ interface EvalAssertionInput {
|
|
|
3008
3052
|
readonly weight?: number;
|
|
3009
3053
|
/** Whether this assertion is required to pass */
|
|
3010
3054
|
readonly required?: boolean | number;
|
|
3055
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
3056
|
+
readonly min_score?: number;
|
|
3011
3057
|
/** Prompt file for llm_grader */
|
|
3012
3058
|
readonly prompt?: string;
|
|
3013
3059
|
/** Script for code_grader */
|
|
@@ -3042,8 +3088,8 @@ interface EvalConfig {
|
|
|
3042
3088
|
readonly task?: (input: string) => string | Promise<string>;
|
|
3043
3089
|
/** Suite-level assertions applied to all tests */
|
|
3044
3090
|
readonly assert?: readonly AssertEntry[];
|
|
3045
|
-
/** Filter tests by ID pattern (glob supported) */
|
|
3046
|
-
readonly filter?: string;
|
|
3091
|
+
/** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
|
|
3092
|
+
readonly filter?: string | readonly string[];
|
|
3047
3093
|
/** Maximum concurrent workers (default: 3) */
|
|
3048
3094
|
readonly workers?: number;
|
|
3049
3095
|
/** Maximum retries on failure (default: 2) */
|
|
@@ -3056,6 +3102,8 @@ interface EvalConfig {
|
|
|
3056
3102
|
readonly verbose?: boolean;
|
|
3057
3103
|
/** Callback for each completed result */
|
|
3058
3104
|
readonly onResult?: (result: EvaluationResult) => void;
|
|
3105
|
+
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
|
|
3106
|
+
readonly threshold?: number;
|
|
3059
3107
|
}
|
|
3060
3108
|
/**
|
|
3061
3109
|
* Summary statistics for an evaluation run.
|
|
@@ -3063,9 +3111,9 @@ interface EvalConfig {
|
|
|
3063
3111
|
interface EvalSummary {
|
|
3064
3112
|
/** Total number of test cases */
|
|
3065
3113
|
readonly total: number;
|
|
3066
|
-
/** Number of passing test cases (score >=
|
|
3114
|
+
/** Number of passing test cases (score >= threshold) */
|
|
3067
3115
|
readonly passed: number;
|
|
3068
|
-
/** Number of failing test cases (score <
|
|
3116
|
+
/** Number of failing test cases (score < threshold) */
|
|
3069
3117
|
readonly failed: number;
|
|
3070
3118
|
/** Total duration in milliseconds */
|
|
3071
3119
|
readonly durationMs: number;
|
|
@@ -3505,7 +3553,7 @@ declare class WorkspacePoolManager {
|
|
|
3505
3553
|
private removeAllSlots;
|
|
3506
3554
|
/**
|
|
3507
3555
|
* Reset an existing slot for reuse:
|
|
3508
|
-
* 1. Reset repos (git reset --hard
|
|
3556
|
+
* 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
|
|
3509
3557
|
* 2. Re-copy template files (skip repo directories)
|
|
3510
3558
|
*/
|
|
3511
3559
|
private resetSlot;
|
|
@@ -3811,15 +3859,21 @@ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string):
|
|
|
3811
3859
|
/**
|
|
3812
3860
|
* Core types for the transcript import pipeline.
|
|
3813
3861
|
*
|
|
3814
|
-
* A TranscriptEntry
|
|
3815
|
-
*
|
|
3862
|
+
* A TranscriptEntry is the internal (camelCase) representation of a parsed
|
|
3863
|
+
* session. A TranscriptJsonLine is the on-disk (snake_case) wire format
|
|
3864
|
+
* written to .agentv/transcripts/*.jsonl files.
|
|
3865
|
+
*
|
|
3866
|
+
* Flow:
|
|
3867
|
+
* raw session JSONL → parser → TranscriptEntry (internal)
|
|
3868
|
+
* TranscriptEntry → toTranscriptJsonLine() → JSONL on disk
|
|
3869
|
+
* JSONL on disk → readTranscriptJsonl() → TranscriptJsonLine[]
|
|
3816
3870
|
*
|
|
3817
|
-
*
|
|
3818
|
-
*
|
|
3871
|
+
* To add a new importer: write a parser that returns TranscriptEntry,
|
|
3872
|
+
* then use toTranscriptJsonLine() to serialize.
|
|
3819
3873
|
*/
|
|
3820
3874
|
|
|
3821
3875
|
/**
|
|
3822
|
-
* A parsed transcript: ordered messages plus session metadata.
|
|
3876
|
+
* A parsed transcript: ordered messages plus session metadata (internal camelCase).
|
|
3823
3877
|
*/
|
|
3824
3878
|
interface TranscriptEntry {
|
|
3825
3879
|
readonly messages: Message[];
|
|
@@ -3829,7 +3883,7 @@ interface TranscriptEntry {
|
|
|
3829
3883
|
readonly costUsd?: number | null;
|
|
3830
3884
|
}
|
|
3831
3885
|
/**
|
|
3832
|
-
* Metadata describing the origin of a transcript.
|
|
3886
|
+
* Metadata describing the origin of a transcript (internal camelCase).
|
|
3833
3887
|
*/
|
|
3834
3888
|
interface TranscriptSource {
|
|
3835
3889
|
readonly provider: string;
|
|
@@ -3837,7 +3891,45 @@ interface TranscriptSource {
|
|
|
3837
3891
|
readonly projectPath?: string;
|
|
3838
3892
|
readonly startedAt?: string;
|
|
3839
3893
|
readonly model?: string;
|
|
3894
|
+
readonly version?: string;
|
|
3895
|
+
readonly gitBranch?: string;
|
|
3896
|
+
readonly cwd?: string;
|
|
3897
|
+
}
|
|
3898
|
+
/**
|
|
3899
|
+
* One line in a transcript JSONL file (snake_case wire format).
|
|
3900
|
+
*
|
|
3901
|
+
* Each line is a self-contained test case with pre-populated output.
|
|
3902
|
+
* The `input` field is the first user message; the `output` field is the
|
|
3903
|
+
* full conversation (Message[]).
|
|
3904
|
+
*/
|
|
3905
|
+
interface TranscriptJsonLine {
|
|
3906
|
+
readonly input: string;
|
|
3907
|
+
readonly output: readonly Message[];
|
|
3908
|
+
readonly token_usage?: {
|
|
3909
|
+
readonly input: number;
|
|
3910
|
+
readonly output: number;
|
|
3911
|
+
readonly cached?: number;
|
|
3912
|
+
};
|
|
3913
|
+
readonly duration_ms?: number;
|
|
3914
|
+
readonly cost_usd?: number | null;
|
|
3915
|
+
readonly source: {
|
|
3916
|
+
readonly provider: string;
|
|
3917
|
+
readonly session_id: string;
|
|
3918
|
+
readonly model?: string;
|
|
3919
|
+
readonly timestamp?: string;
|
|
3920
|
+
readonly git_branch?: string;
|
|
3921
|
+
readonly cwd?: string;
|
|
3922
|
+
readonly version?: string;
|
|
3923
|
+
};
|
|
3840
3924
|
}
|
|
3925
|
+
/**
|
|
3926
|
+
* Convert a parsed TranscriptEntry to the on-disk JSONL wire format.
|
|
3927
|
+
*/
|
|
3928
|
+
declare function toTranscriptJsonLine(entry: TranscriptEntry): TranscriptJsonLine;
|
|
3929
|
+
/**
|
|
3930
|
+
* Read a transcript JSONL file and parse each line into a TranscriptJsonLine.
|
|
3931
|
+
*/
|
|
3932
|
+
declare function readTranscriptJsonl(filePath: string): Promise<TranscriptJsonLine[]>;
|
|
3841
3933
|
/**
|
|
3842
3934
|
* Read a JSONL transcript file and return its raw text.
|
|
3843
3935
|
* Throws if the file does not exist or cannot be read.
|
|
@@ -3871,6 +3963,70 @@ declare function readTranscriptFile(filePath: string): Promise<string>;
|
|
|
3871
3963
|
|
|
3872
3964
|
declare function parseClaudeSession(jsonl: string): TranscriptEntry;
|
|
3873
3965
|
|
|
3966
|
+
/**
|
|
3967
|
+
* Codex CLI session JSONL parser.
|
|
3968
|
+
*
|
|
3969
|
+
* Reads a Codex CLI rollout transcript
|
|
3970
|
+
* (~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl) and converts it to AgentV's
|
|
3971
|
+
* Message[] format.
|
|
3972
|
+
*
|
|
3973
|
+
* Each line is a JSON object with one of these top-level types:
|
|
3974
|
+
* session_meta → session metadata (id, cwd, cli_version, model)
|
|
3975
|
+
* turn_context → per-turn context (model, cwd, turn_id)
|
|
3976
|
+
* event_msg → events: task_started, task_complete, user_message,
|
|
3977
|
+
* agent_message, token_count
|
|
3978
|
+
* response_item → conversation items: message, function_call,
|
|
3979
|
+
* function_call_output, reasoning, custom_tool_call,
|
|
3980
|
+
* custom_tool_call_output
|
|
3981
|
+
*
|
|
3982
|
+
* Key behaviors:
|
|
3983
|
+
* - response_item with type=message and role=user → user Message
|
|
3984
|
+
* - response_item with type=message and role=assistant → assistant Message
|
|
3985
|
+
* - response_item with type=function_call → ToolCall (pending output)
|
|
3986
|
+
* - response_item with type=function_call_output → matched to pending call by call_id
|
|
3987
|
+
* - response_item with type=reasoning → skipped (thinking tokens)
|
|
3988
|
+
* - response_item with role=developer → skipped (system prompt)
|
|
3989
|
+
* - session_meta → source metadata (session_id, cwd, version, model)
|
|
3990
|
+
* - turn_context → model name extraction
|
|
3991
|
+
* - Duration is from first↔last event timestamp
|
|
3992
|
+
* - cost_usd is null (Codex CLI does not report per-session cost)
|
|
3993
|
+
* - Token usage not available from rollout format (rate limit info only)
|
|
3994
|
+
*
|
|
3995
|
+
* To add a new response_item type: add a case to the switch in parseCodexSession().
|
|
3996
|
+
*/
|
|
3997
|
+
|
|
3998
|
+
declare function parseCodexSession(jsonl: string): TranscriptEntry;
|
|
3999
|
+
|
|
4000
|
+
/**
|
|
4001
|
+
* Codex CLI session discovery.
|
|
4002
|
+
*
|
|
4003
|
+
* Scans ~/.codex/sessions/ for rollout JSONL files. Codex CLI stores sessions at:
|
|
4004
|
+
* ~/.codex/sessions/YYYY/MM/DD/rollout-<timestamp>-<uuid>.jsonl
|
|
4005
|
+
*
|
|
4006
|
+
* Sessions are returned sorted by modification time (most recent first).
|
|
4007
|
+
*/
|
|
4008
|
+
interface CodexSession {
|
|
4009
|
+
/** UUID from the filename */
|
|
4010
|
+
readonly sessionId: string;
|
|
4011
|
+
/** Full path to the JSONL file */
|
|
4012
|
+
readonly filePath: string;
|
|
4013
|
+
/** Filename (e.g., rollout-2026-03-29T14-22-01-<uuid>.jsonl) */
|
|
4014
|
+
readonly filename: string;
|
|
4015
|
+
/** Last modification time */
|
|
4016
|
+
readonly updatedAt: Date;
|
|
4017
|
+
}
|
|
4018
|
+
interface CodexDiscoverOptions {
|
|
4019
|
+
/** Filter by date string (YYYY-MM-DD). */
|
|
4020
|
+
readonly date?: string;
|
|
4021
|
+
/** Maximum number of sessions to return (default: 10). */
|
|
4022
|
+
readonly limit?: number;
|
|
4023
|
+
/** Override the default ~/.codex/sessions directory. */
|
|
4024
|
+
readonly sessionsDir?: string;
|
|
4025
|
+
/** Return only the most recent session. */
|
|
4026
|
+
readonly latest?: boolean;
|
|
4027
|
+
}
|
|
4028
|
+
declare function discoverCodexSessions(opts?: CodexDiscoverOptions): Promise<CodexSession[]>;
|
|
4029
|
+
|
|
3874
4030
|
/**
|
|
3875
4031
|
* Claude Code session discovery.
|
|
3876
4032
|
*
|
|
@@ -3907,9 +4063,80 @@ interface ClaudeDiscoverOptions {
|
|
|
3907
4063
|
}
|
|
3908
4064
|
declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<ClaudeSession[]>;
|
|
3909
4065
|
|
|
4066
|
+
/**
|
|
4067
|
+
* Transcript provider — replays pre-recorded session transcripts through the
|
|
4068
|
+
* evaluation pipeline without invoking any live agent.
|
|
4069
|
+
*
|
|
4070
|
+
* Used by `agentv eval --transcript <file>` to grade imported sessions.
|
|
4071
|
+
*
|
|
4072
|
+
* How it works:
|
|
4073
|
+
* 1. Reads a transcript JSONL file (produced by `agentv import`)
|
|
4074
|
+
* 2. Each invocation pops the next line from the transcript
|
|
4075
|
+
* 3. Returns a ProviderResponse with pre-populated output, token usage, etc.
|
|
4076
|
+
* 4. Evaluators run identically to live eval — they see the same ProviderResponse
|
|
4077
|
+
*
|
|
4078
|
+
* The provider name in results is set to the source provider from the transcript
|
|
4079
|
+
* (e.g., "claude", "codex", "copilot").
|
|
4080
|
+
*/
|
|
4081
|
+
|
|
4082
|
+
declare class TranscriptProvider implements Provider {
|
|
4083
|
+
readonly id: string;
|
|
4084
|
+
readonly kind: "transcript";
|
|
4085
|
+
readonly targetName: string;
|
|
4086
|
+
private lines;
|
|
4087
|
+
private cursor;
|
|
4088
|
+
constructor(targetName: string, lines: TranscriptJsonLine[]);
|
|
4089
|
+
/**
|
|
4090
|
+
* Create a TranscriptProvider from a JSONL file path.
|
|
4091
|
+
*/
|
|
4092
|
+
static fromFile(filePath: string): Promise<TranscriptProvider>;
|
|
4093
|
+
get lineCount(): number;
|
|
4094
|
+
invoke(_request: ProviderRequest): Promise<ProviderResponse>;
|
|
4095
|
+
}
|
|
4096
|
+
|
|
4097
|
+
/**
|
|
4098
|
+
* Copilot CLI events.jsonl parser.
|
|
4099
|
+
*
|
|
4100
|
+
* Reads a Copilot CLI session transcript (events.jsonl) and converts it to
|
|
4101
|
+
* AgentV's Message[] format. Each line is a JSON object with:
|
|
4102
|
+
* { type, data: { ...payload }, id, timestamp, parentId }
|
|
4103
|
+
*
|
|
4104
|
+
* All event-specific fields live under event.data.*, while type, id, timestamp,
|
|
4105
|
+
* and parentId are at the top level.
|
|
4106
|
+
*
|
|
4107
|
+
* Supported event types:
|
|
4108
|
+
* session.start → session metadata (data.sessionId, data.context.cwd)
|
|
4109
|
+
* user.message → Message { role: 'user' }
|
|
4110
|
+
* assistant.message → Message { role: 'assistant', toolCalls from data.toolRequests }
|
|
4111
|
+
* skill.invoked → ToolCall { tool: 'Skill', input: { skill: data.name } }
|
|
4112
|
+
* tool.execution_start + tool.execution_complete → ToolCall with output
|
|
4113
|
+
* session.shutdown → token usage from data.modelMetrics, end timestamp
|
|
4114
|
+
*
|
|
4115
|
+
* To add a new event type:
|
|
4116
|
+
* 1. Add a case to the switch in parseCopilotEvents()
|
|
4117
|
+
* 2. Map it to a Message or ToolCall
|
|
4118
|
+
* 3. Add a test in copilot-log-parser.test.ts
|
|
4119
|
+
*/
|
|
4120
|
+
|
|
4121
|
+
interface CopilotSessionMeta {
|
|
4122
|
+
readonly sessionId: string;
|
|
4123
|
+
readonly model: string;
|
|
4124
|
+
readonly cwd: string;
|
|
4125
|
+
readonly repository?: string;
|
|
4126
|
+
readonly branch?: string;
|
|
4127
|
+
readonly startedAt?: string;
|
|
4128
|
+
}
|
|
4129
|
+
interface ParsedCopilotSession {
|
|
4130
|
+
readonly messages: Message[];
|
|
4131
|
+
readonly meta: CopilotSessionMeta;
|
|
4132
|
+
readonly tokenUsage?: ProviderTokenUsage;
|
|
4133
|
+
readonly durationMs?: number;
|
|
4134
|
+
}
|
|
4135
|
+
declare function parseCopilotEvents(eventsJsonl: string): ParsedCopilotSession;
|
|
4136
|
+
|
|
3910
4137
|
type AgentKernel = {
|
|
3911
4138
|
status: string;
|
|
3912
4139
|
};
|
|
3913
4140
|
declare function createAgentKernel(): AgentKernel;
|
|
3914
4141
|
|
|
3915
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4142
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|