@agentv/core 4.6.1 → 4.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-ZK4GG7PR.js → chunk-75RFVESM.js} +215 -127
- package/dist/chunk-75RFVESM.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +110 -95
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +30 -72
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1271 -465
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +288 -74
- package/dist/index.d.ts +288 -74
- package/dist/index.js +1024 -311
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-ZK4GG7PR.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -73,7 +73,7 @@ interface ChatMessage {
|
|
|
73
73
|
readonly name?: string;
|
|
74
74
|
}
|
|
75
75
|
type ChatPrompt = readonly ChatMessage[];
|
|
76
|
-
type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
|
|
76
|
+
type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv' | 'transcript';
|
|
77
77
|
/** Callbacks for real-time observability during provider execution */
|
|
78
78
|
interface ProviderStreamCallbacks {
|
|
79
79
|
onToolCallStart?: (toolName: string, toolCallId?: string) => void;
|
|
@@ -222,25 +222,19 @@ interface TargetDefinition {
|
|
|
222
222
|
readonly judge_target?: string | undefined;
|
|
223
223
|
readonly workers?: number | undefined;
|
|
224
224
|
readonly provider_batching?: boolean | undefined;
|
|
225
|
-
readonly
|
|
225
|
+
readonly subagent_mode_allowed?: boolean | undefined;
|
|
226
226
|
readonly endpoint?: string | unknown | undefined;
|
|
227
227
|
readonly base_url?: string | unknown | undefined;
|
|
228
|
-
readonly baseUrl?: string | unknown | undefined;
|
|
229
228
|
readonly resource?: string | unknown | undefined;
|
|
230
|
-
readonly resourceName?: string | unknown | undefined;
|
|
231
229
|
readonly api_key?: string | unknown | undefined;
|
|
232
|
-
readonly apiKey?: string | unknown | undefined;
|
|
233
230
|
readonly deployment?: string | unknown | undefined;
|
|
234
|
-
readonly deploymentName?: string | unknown | undefined;
|
|
235
231
|
readonly model?: string | unknown | undefined;
|
|
236
232
|
readonly version?: string | unknown | undefined;
|
|
237
233
|
readonly api_version?: string | unknown | undefined;
|
|
238
234
|
readonly variant?: string | unknown | undefined;
|
|
239
235
|
readonly thinking_budget?: number | unknown | undefined;
|
|
240
|
-
readonly thinkingBudget?: number | unknown | undefined;
|
|
241
236
|
readonly temperature?: number | unknown | undefined;
|
|
242
237
|
readonly max_output_tokens?: number | unknown | undefined;
|
|
243
|
-
readonly maxTokens?: number | unknown | undefined;
|
|
244
238
|
readonly executable?: string | unknown | undefined;
|
|
245
239
|
readonly command?: string | unknown | undefined;
|
|
246
240
|
readonly binary?: string | unknown | undefined;
|
|
@@ -248,63 +242,35 @@ interface TargetDefinition {
|
|
|
248
242
|
readonly arguments?: unknown | undefined;
|
|
249
243
|
readonly cwd?: string | unknown | undefined;
|
|
250
244
|
readonly timeout_seconds?: number | unknown | undefined;
|
|
251
|
-
readonly timeoutSeconds?: number | unknown | undefined;
|
|
252
245
|
readonly log_dir?: string | unknown | undefined;
|
|
253
|
-
readonly logDir?: string | unknown | undefined;
|
|
254
246
|
readonly log_directory?: string | unknown | undefined;
|
|
255
|
-
readonly logDirectory?: string | unknown | undefined;
|
|
256
247
|
readonly log_format?: string | unknown | undefined;
|
|
257
|
-
readonly logFormat?: string | unknown | undefined;
|
|
258
248
|
readonly log_output_format?: string | unknown | undefined;
|
|
259
|
-
readonly logOutputFormat?: string | unknown | undefined;
|
|
260
249
|
readonly system_prompt?: string | unknown | undefined;
|
|
261
|
-
readonly systemPrompt?: string | unknown | undefined;
|
|
262
250
|
readonly max_turns?: number | unknown | undefined;
|
|
263
|
-
readonly maxTurns?: number | unknown | undefined;
|
|
264
251
|
readonly max_budget_usd?: number | unknown | undefined;
|
|
265
|
-
readonly maxBudgetUsd?: number | unknown | undefined;
|
|
266
252
|
readonly response?: string | unknown | undefined;
|
|
267
|
-
readonly delayMs?: number | unknown | undefined;
|
|
268
|
-
readonly delayMinMs?: number | unknown | undefined;
|
|
269
|
-
readonly delayMaxMs?: number | unknown | undefined;
|
|
270
253
|
readonly wait?: boolean | unknown | undefined;
|
|
271
254
|
readonly dry_run?: boolean | unknown | undefined;
|
|
272
|
-
readonly dryRun?: boolean | unknown | undefined;
|
|
273
255
|
readonly subagent_root?: string | unknown | undefined;
|
|
274
|
-
readonly subagentRoot?: string | unknown | undefined;
|
|
275
256
|
readonly workspace_template?: string | unknown | undefined;
|
|
276
|
-
readonly workspaceTemplate?: string | unknown | undefined;
|
|
277
257
|
readonly files_format?: string | unknown | undefined;
|
|
278
|
-
readonly filesFormat?: string | unknown | undefined;
|
|
279
258
|
readonly attachments_format?: string | unknown | undefined;
|
|
280
|
-
readonly attachmentsFormat?: string | unknown | undefined;
|
|
281
259
|
readonly env?: unknown | undefined;
|
|
282
260
|
readonly healthcheck?: unknown | undefined;
|
|
283
261
|
readonly session_dir?: string | unknown | undefined;
|
|
284
|
-
readonly sessionDir?: string | unknown | undefined;
|
|
285
262
|
readonly session_id?: string | unknown | undefined;
|
|
286
|
-
readonly sessionId?: string | unknown | undefined;
|
|
287
263
|
readonly discover?: string | unknown | undefined;
|
|
288
264
|
readonly session_state_dir?: string | unknown | undefined;
|
|
289
|
-
readonly sessionStateDir?: string | unknown | undefined;
|
|
290
265
|
readonly cli_url?: string | unknown | undefined;
|
|
291
|
-
readonly cliUrl?: string | unknown | undefined;
|
|
292
266
|
readonly cli_path?: string | unknown | undefined;
|
|
293
|
-
readonly cliPath?: string | unknown | undefined;
|
|
294
267
|
readonly github_token?: string | unknown | undefined;
|
|
295
|
-
readonly githubToken?: string | unknown | undefined;
|
|
296
268
|
readonly max_retries?: number | unknown | undefined;
|
|
297
|
-
readonly maxRetries?: number | unknown | undefined;
|
|
298
269
|
readonly retry_initial_delay_ms?: number | unknown | undefined;
|
|
299
|
-
readonly retryInitialDelayMs?: number | unknown | undefined;
|
|
300
270
|
readonly retry_max_delay_ms?: number | unknown | undefined;
|
|
301
|
-
readonly retryMaxDelayMs?: number | unknown | undefined;
|
|
302
271
|
readonly retry_backoff_factor?: number | unknown | undefined;
|
|
303
|
-
readonly retryBackoffFactor?: number | unknown | undefined;
|
|
304
272
|
readonly retry_status_codes?: unknown | undefined;
|
|
305
|
-
readonly retryStatusCodes?: unknown | undefined;
|
|
306
273
|
readonly fallback_targets?: readonly string[] | unknown | undefined;
|
|
307
|
-
readonly fallbackTargets?: readonly string[] | unknown | undefined;
|
|
308
274
|
}
|
|
309
275
|
|
|
310
276
|
/**
|
|
@@ -375,6 +341,8 @@ interface ToolTrajectoryEvaluatorConfig {
|
|
|
375
341
|
/** Optional weight for top-level aggregation (defaults to 1.0) */
|
|
376
342
|
readonly weight?: number;
|
|
377
343
|
readonly required?: boolean | number;
|
|
344
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
345
|
+
readonly min_score?: number;
|
|
378
346
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
379
347
|
readonly negate?: boolean;
|
|
380
348
|
/** Default argument matching mode for all expected items (defaults to 'exact') */
|
|
@@ -667,6 +635,8 @@ type CodeEvaluatorConfig = {
|
|
|
667
635
|
readonly resolvedCwd?: string;
|
|
668
636
|
readonly weight?: number;
|
|
669
637
|
readonly required?: boolean | number;
|
|
638
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
639
|
+
readonly min_score?: number;
|
|
670
640
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
671
641
|
readonly negate?: boolean;
|
|
672
642
|
/** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
|
|
@@ -699,6 +669,8 @@ type LlmGraderEvaluatorConfig = {
|
|
|
699
669
|
readonly rubrics?: readonly RubricItem[];
|
|
700
670
|
readonly weight?: number;
|
|
701
671
|
readonly required?: boolean | number;
|
|
672
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
673
|
+
readonly min_score?: number;
|
|
702
674
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
703
675
|
readonly negate?: boolean;
|
|
704
676
|
/** Optional target override for this grader (uses a named LLM target from targets.yaml). */
|
|
@@ -737,13 +709,17 @@ type RubricItem = {
|
|
|
737
709
|
readonly outcome?: string;
|
|
738
710
|
readonly weight: number;
|
|
739
711
|
/**
|
|
740
|
-
* Legacy boolean gating (
|
|
741
|
-
* Use required_min_score instead for finer control.
|
|
712
|
+
* Legacy boolean gating (treated as min_score: 1.0 for score-range rubrics).
|
|
742
713
|
*/
|
|
743
714
|
readonly required?: boolean;
|
|
744
715
|
/**
|
|
745
|
-
* Minimum score (0-
|
|
746
|
-
*
|
|
716
|
+
* Minimum score (0-1 scale) required to pass this criterion.
|
|
717
|
+
* Internally compared against normalized score (rawScore / 10).
|
|
718
|
+
*/
|
|
719
|
+
readonly min_score?: number;
|
|
720
|
+
/**
|
|
721
|
+
* @deprecated Use min_score (0-1 scale) instead.
|
|
722
|
+
* Legacy: minimum score on 0-10 integer scale.
|
|
747
723
|
*/
|
|
748
724
|
readonly required_min_score?: number;
|
|
749
725
|
/**
|
|
@@ -776,6 +752,8 @@ type CompositeEvaluatorConfig = {
|
|
|
776
752
|
readonly aggregator: CompositeAggregatorConfig;
|
|
777
753
|
readonly weight?: number;
|
|
778
754
|
readonly required?: boolean | number;
|
|
755
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
756
|
+
readonly min_score?: number;
|
|
779
757
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
780
758
|
readonly negate?: boolean;
|
|
781
759
|
};
|
|
@@ -820,6 +798,8 @@ type FieldAccuracyEvaluatorConfig = {
|
|
|
820
798
|
readonly aggregation?: FieldAggregationType;
|
|
821
799
|
readonly weight?: number;
|
|
822
800
|
readonly required?: boolean | number;
|
|
801
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
802
|
+
readonly min_score?: number;
|
|
823
803
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
824
804
|
readonly negate?: boolean;
|
|
825
805
|
};
|
|
@@ -834,6 +814,8 @@ type LatencyEvaluatorConfig = {
|
|
|
834
814
|
readonly threshold: number;
|
|
835
815
|
readonly weight?: number;
|
|
836
816
|
readonly required?: boolean | number;
|
|
817
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
818
|
+
readonly min_score?: number;
|
|
837
819
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
838
820
|
readonly negate?: boolean;
|
|
839
821
|
};
|
|
@@ -848,6 +830,8 @@ type CostEvaluatorConfig = {
|
|
|
848
830
|
readonly budget: number;
|
|
849
831
|
readonly weight?: number;
|
|
850
832
|
readonly required?: boolean | number;
|
|
833
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
834
|
+
readonly min_score?: number;
|
|
851
835
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
852
836
|
readonly negate?: boolean;
|
|
853
837
|
};
|
|
@@ -866,6 +850,8 @@ type TokenUsageEvaluatorConfig = {
|
|
|
866
850
|
readonly max_output?: number;
|
|
867
851
|
readonly weight?: number;
|
|
868
852
|
readonly required?: boolean | number;
|
|
853
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
854
|
+
readonly min_score?: number;
|
|
869
855
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
870
856
|
readonly negate?: boolean;
|
|
871
857
|
};
|
|
@@ -893,6 +879,8 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
893
879
|
readonly exploration_tolerance?: number;
|
|
894
880
|
readonly weight?: number;
|
|
895
881
|
readonly required?: boolean | number;
|
|
882
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
883
|
+
readonly min_score?: number;
|
|
896
884
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
897
885
|
readonly negate?: boolean;
|
|
898
886
|
};
|
|
@@ -906,6 +894,8 @@ type ContainsEvaluatorConfig = {
|
|
|
906
894
|
readonly value: string;
|
|
907
895
|
readonly weight?: number;
|
|
908
896
|
readonly required?: boolean | number;
|
|
897
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
898
|
+
readonly min_score?: number;
|
|
909
899
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
910
900
|
readonly negate?: boolean;
|
|
911
901
|
};
|
|
@@ -919,6 +909,8 @@ type ContainsAnyEvaluatorConfig = {
|
|
|
919
909
|
readonly value: readonly string[];
|
|
920
910
|
readonly weight?: number;
|
|
921
911
|
readonly required?: boolean | number;
|
|
912
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
913
|
+
readonly min_score?: number;
|
|
922
914
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
923
915
|
readonly negate?: boolean;
|
|
924
916
|
};
|
|
@@ -932,6 +924,8 @@ type ContainsAllEvaluatorConfig = {
|
|
|
932
924
|
readonly value: readonly string[];
|
|
933
925
|
readonly weight?: number;
|
|
934
926
|
readonly required?: boolean | number;
|
|
927
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
928
|
+
readonly min_score?: number;
|
|
935
929
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
936
930
|
readonly negate?: boolean;
|
|
937
931
|
};
|
|
@@ -945,6 +939,8 @@ type IcontainsEvaluatorConfig = {
|
|
|
945
939
|
readonly value: string;
|
|
946
940
|
readonly weight?: number;
|
|
947
941
|
readonly required?: boolean | number;
|
|
942
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
943
|
+
readonly min_score?: number;
|
|
948
944
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
949
945
|
readonly negate?: boolean;
|
|
950
946
|
};
|
|
@@ -958,6 +954,8 @@ type IcontainsAnyEvaluatorConfig = {
|
|
|
958
954
|
readonly value: readonly string[];
|
|
959
955
|
readonly weight?: number;
|
|
960
956
|
readonly required?: boolean | number;
|
|
957
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
958
|
+
readonly min_score?: number;
|
|
961
959
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
962
960
|
readonly negate?: boolean;
|
|
963
961
|
};
|
|
@@ -971,6 +969,8 @@ type IcontainsAllEvaluatorConfig = {
|
|
|
971
969
|
readonly value: readonly string[];
|
|
972
970
|
readonly weight?: number;
|
|
973
971
|
readonly required?: boolean | number;
|
|
972
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
973
|
+
readonly min_score?: number;
|
|
974
974
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
975
975
|
readonly negate?: boolean;
|
|
976
976
|
};
|
|
@@ -984,6 +984,8 @@ type StartsWithEvaluatorConfig = {
|
|
|
984
984
|
readonly value: string;
|
|
985
985
|
readonly weight?: number;
|
|
986
986
|
readonly required?: boolean | number;
|
|
987
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
988
|
+
readonly min_score?: number;
|
|
987
989
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
988
990
|
readonly negate?: boolean;
|
|
989
991
|
};
|
|
@@ -997,6 +999,8 @@ type EndsWithEvaluatorConfig = {
|
|
|
997
999
|
readonly value: string;
|
|
998
1000
|
readonly weight?: number;
|
|
999
1001
|
readonly required?: boolean | number;
|
|
1002
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1003
|
+
readonly min_score?: number;
|
|
1000
1004
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1001
1005
|
readonly negate?: boolean;
|
|
1002
1006
|
};
|
|
@@ -1012,6 +1016,8 @@ type RegexEvaluatorConfig = {
|
|
|
1012
1016
|
readonly flags?: string;
|
|
1013
1017
|
readonly weight?: number;
|
|
1014
1018
|
readonly required?: boolean | number;
|
|
1019
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1020
|
+
readonly min_score?: number;
|
|
1015
1021
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1016
1022
|
readonly negate?: boolean;
|
|
1017
1023
|
};
|
|
@@ -1024,6 +1030,8 @@ type IsJsonEvaluatorConfig = {
|
|
|
1024
1030
|
readonly type: 'is-json';
|
|
1025
1031
|
readonly weight?: number;
|
|
1026
1032
|
readonly required?: boolean | number;
|
|
1033
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1034
|
+
readonly min_score?: number;
|
|
1027
1035
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1028
1036
|
readonly negate?: boolean;
|
|
1029
1037
|
};
|
|
@@ -1037,6 +1045,8 @@ type EqualsEvaluatorConfig = {
|
|
|
1037
1045
|
readonly value: string;
|
|
1038
1046
|
readonly weight?: number;
|
|
1039
1047
|
readonly required?: boolean | number;
|
|
1048
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1049
|
+
readonly min_score?: number;
|
|
1040
1050
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1041
1051
|
readonly negate?: boolean;
|
|
1042
1052
|
};
|
|
@@ -1050,6 +1060,8 @@ type RubricsEvaluatorConfig = {
|
|
|
1050
1060
|
readonly criteria: readonly RubricItem[];
|
|
1051
1061
|
readonly weight?: number;
|
|
1052
1062
|
readonly required?: boolean | number;
|
|
1063
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1064
|
+
readonly min_score?: number;
|
|
1053
1065
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
1054
1066
|
readonly negate?: boolean;
|
|
1055
1067
|
};
|
|
@@ -1068,6 +1080,8 @@ type SkillTriggerEvaluatorConfig = {
|
|
|
1068
1080
|
readonly should_trigger?: boolean;
|
|
1069
1081
|
readonly weight?: number;
|
|
1070
1082
|
readonly required?: boolean | number;
|
|
1083
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1084
|
+
readonly min_score?: number;
|
|
1071
1085
|
readonly negate?: boolean;
|
|
1072
1086
|
};
|
|
1073
1087
|
/**
|
|
@@ -1079,6 +1093,8 @@ type InlineAssertEvaluatorConfig = {
|
|
|
1079
1093
|
readonly type: 'inline-assert';
|
|
1080
1094
|
readonly weight?: number;
|
|
1081
1095
|
readonly required?: boolean | number;
|
|
1096
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
1097
|
+
readonly min_score?: number;
|
|
1082
1098
|
readonly negate?: boolean;
|
|
1083
1099
|
};
|
|
1084
1100
|
type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
|
|
@@ -1087,7 +1103,7 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
|
|
|
1087
1103
|
*/
|
|
1088
1104
|
interface EvalTest {
|
|
1089
1105
|
readonly id: string;
|
|
1090
|
-
readonly
|
|
1106
|
+
readonly suite?: string;
|
|
1091
1107
|
readonly category?: string;
|
|
1092
1108
|
readonly conversation_id?: string;
|
|
1093
1109
|
readonly question: string;
|
|
@@ -1104,6 +1120,8 @@ interface EvalTest {
|
|
|
1104
1120
|
readonly metadata?: Record<string, unknown>;
|
|
1105
1121
|
/** Per-test target override (matrix evaluation) */
|
|
1106
1122
|
readonly targets?: readonly string[];
|
|
1123
|
+
/** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
|
|
1124
|
+
readonly threshold?: number;
|
|
1107
1125
|
}
|
|
1108
1126
|
/** @deprecated Use `EvalTest` instead */
|
|
1109
1127
|
type EvalCase = EvalTest;
|
|
@@ -1197,7 +1215,7 @@ type FailOnError = boolean;
|
|
|
1197
1215
|
interface EvaluationResult {
|
|
1198
1216
|
readonly timestamp: string;
|
|
1199
1217
|
readonly testId: string;
|
|
1200
|
-
readonly
|
|
1218
|
+
readonly suite?: string;
|
|
1201
1219
|
readonly category?: string;
|
|
1202
1220
|
readonly conversationId?: string;
|
|
1203
1221
|
readonly score: number;
|
|
@@ -1427,8 +1445,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skill
|
|
|
1427
1445
|
|
|
1428
1446
|
type LoadOptions = {
|
|
1429
1447
|
readonly verbose?: boolean;
|
|
1430
|
-
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
|
|
1431
|
-
readonly filter?: string;
|
|
1448
|
+
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
|
|
1449
|
+
readonly filter?: string | readonly string[];
|
|
1432
1450
|
/** Category derived from the eval file's directory path */
|
|
1433
1451
|
readonly category?: string;
|
|
1434
1452
|
};
|
|
@@ -1599,7 +1617,7 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
1599
1617
|
/**
|
|
1600
1618
|
* Strict normalized schema for CLI target configuration.
|
|
1601
1619
|
* This is the final validated shape after environment variable resolution
|
|
1602
|
-
* and
|
|
1620
|
+
* and internal field normalization.
|
|
1603
1621
|
*
|
|
1604
1622
|
* Uses .strict() to reject unknown properties, ensuring configuration
|
|
1605
1623
|
* errors are caught early rather than silently ignored.
|
|
@@ -1648,8 +1666,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1648
1666
|
command: string;
|
|
1649
1667
|
verbose?: boolean | undefined;
|
|
1650
1668
|
cwd?: string | undefined;
|
|
1651
|
-
filesFormat?: string | undefined;
|
|
1652
|
-
workspaceTemplate?: string | undefined;
|
|
1653
1669
|
healthcheck?: {
|
|
1654
1670
|
url: string;
|
|
1655
1671
|
timeoutMs?: number | undefined;
|
|
@@ -1658,14 +1674,14 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1658
1674
|
cwd?: string | undefined;
|
|
1659
1675
|
timeoutMs?: number | undefined;
|
|
1660
1676
|
} | undefined;
|
|
1661
|
-
keepTempFiles?: boolean | undefined;
|
|
1662
1677
|
timeoutMs?: number | undefined;
|
|
1678
|
+
filesFormat?: string | undefined;
|
|
1679
|
+
workspaceTemplate?: string | undefined;
|
|
1680
|
+
keepTempFiles?: boolean | undefined;
|
|
1663
1681
|
}, {
|
|
1664
1682
|
command: string;
|
|
1665
1683
|
verbose?: boolean | undefined;
|
|
1666
1684
|
cwd?: string | undefined;
|
|
1667
|
-
filesFormat?: string | undefined;
|
|
1668
|
-
workspaceTemplate?: string | undefined;
|
|
1669
1685
|
healthcheck?: {
|
|
1670
1686
|
url: string;
|
|
1671
1687
|
timeoutMs?: number | undefined;
|
|
@@ -1674,8 +1690,10 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1674
1690
|
cwd?: string | undefined;
|
|
1675
1691
|
timeoutMs?: number | undefined;
|
|
1676
1692
|
} | undefined;
|
|
1677
|
-
keepTempFiles?: boolean | undefined;
|
|
1678
1693
|
timeoutMs?: number | undefined;
|
|
1694
|
+
filesFormat?: string | undefined;
|
|
1695
|
+
workspaceTemplate?: string | undefined;
|
|
1696
|
+
keepTempFiles?: boolean | undefined;
|
|
1679
1697
|
}>;
|
|
1680
1698
|
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
1681
1699
|
/**
|
|
@@ -1707,6 +1725,7 @@ interface AzureResolvedConfig {
|
|
|
1707
1725
|
readonly deploymentName: string;
|
|
1708
1726
|
readonly apiKey: string;
|
|
1709
1727
|
readonly version?: string;
|
|
1728
|
+
readonly apiFormat?: ApiFormat;
|
|
1710
1729
|
readonly temperature?: number;
|
|
1711
1730
|
readonly maxOutputTokens?: number;
|
|
1712
1731
|
readonly retry?: RetryConfig;
|
|
@@ -1931,15 +1950,20 @@ type ResolvedTarget = (ResolvedTargetBase & {
|
|
|
1931
1950
|
}) | (ResolvedTargetBase & {
|
|
1932
1951
|
readonly kind: 'cli';
|
|
1933
1952
|
readonly config: CliResolvedConfig;
|
|
1953
|
+
}) | (ResolvedTargetBase & {
|
|
1954
|
+
readonly kind: 'transcript';
|
|
1955
|
+
readonly config: Record<string, never>;
|
|
1934
1956
|
});
|
|
1935
1957
|
/**
|
|
1936
1958
|
* Optional settings accepted on ALL target definitions regardless of provider.
|
|
1937
1959
|
* Exported so the targets validator can reuse the same list — adding a field
|
|
1938
1960
|
* here automatically makes it valid in targets.yaml without a separate update.
|
|
1939
1961
|
*/
|
|
1940
|
-
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "
|
|
1962
|
+
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
|
|
1941
1963
|
declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
|
|
1942
|
-
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
|
|
1964
|
+
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
|
|
1965
|
+
readonly emitDeprecationWarnings?: boolean;
|
|
1966
|
+
}): ResolvedTarget;
|
|
1943
1967
|
|
|
1944
1968
|
/**
|
|
1945
1969
|
* Extensible provider registry.
|
|
@@ -2204,19 +2228,25 @@ interface EvaluatorFactory {
|
|
|
2204
2228
|
*
|
|
2205
2229
|
* Scoring model:
|
|
2206
2230
|
* score ∈ [0, 1] — continuous quality signal
|
|
2207
|
-
* verdict — binary classification derived from score via
|
|
2231
|
+
* verdict — binary classification derived from score via threshold
|
|
2208
2232
|
*
|
|
2209
|
-
* score >=
|
|
2210
|
-
* score <
|
|
2233
|
+
* score >= threshold → 'pass'
|
|
2234
|
+
* score < threshold → 'fail'
|
|
2211
2235
|
* (infrastructure skip) → 'skip'
|
|
2212
2236
|
*
|
|
2213
|
-
*
|
|
2214
|
-
*
|
|
2237
|
+
* Scoring scale principle:
|
|
2238
|
+
* All user-configurable score thresholds use 0-1 scale.
|
|
2239
|
+
* The only 0-10 values in YAML are `score_ranges` which define LLM integer output band labels.
|
|
2240
|
+
*
|
|
2241
|
+
* Default threshold is 0.8. Override via CLI `--threshold`, suite `execution.threshold`,
|
|
2242
|
+
* or per-test `execution.threshold`. All verdict derivation flows through scoreToVerdict().
|
|
2215
2243
|
*/
|
|
2216
2244
|
|
|
2217
|
-
/**
|
|
2245
|
+
/** Default score threshold for pass verdict (0-1). Scores below this are fail. */
|
|
2246
|
+
declare const DEFAULT_THRESHOLD = 0.8;
|
|
2247
|
+
/** @deprecated Use DEFAULT_THRESHOLD instead. */
|
|
2218
2248
|
declare const PASS_THRESHOLD = 0.8;
|
|
2219
|
-
declare function scoreToVerdict(score: number): EvaluationVerdict;
|
|
2249
|
+
declare function scoreToVerdict(score: number, threshold?: number): EvaluationVerdict;
|
|
2220
2250
|
declare function clampScore(value: number): number;
|
|
2221
2251
|
declare function extractJsonBlob(text: string): string | undefined;
|
|
2222
2252
|
declare function parseJsonFromText(text: string): unknown;
|
|
@@ -2499,6 +2529,7 @@ declare class LlmGraderEvaluator implements Evaluator {
|
|
|
2499
2529
|
private buildScoreRangePrompt;
|
|
2500
2530
|
private buildRubricPrompt;
|
|
2501
2531
|
private runWithRetry;
|
|
2532
|
+
private generateStructuredResponse;
|
|
2502
2533
|
}
|
|
2503
2534
|
/**
|
|
2504
2535
|
* Build the mandatory output schema that all evaluators must follow.
|
|
@@ -2837,8 +2868,8 @@ interface RunEvaluationOptions {
|
|
|
2837
2868
|
readonly cache?: EvaluationCache;
|
|
2838
2869
|
readonly useCache?: boolean;
|
|
2839
2870
|
readonly now?: () => Date;
|
|
2840
|
-
/** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
|
|
2841
|
-
readonly filter?: string;
|
|
2871
|
+
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
|
|
2872
|
+
readonly filter?: string | readonly string[];
|
|
2842
2873
|
readonly verbose?: boolean;
|
|
2843
2874
|
readonly maxConcurrency?: number;
|
|
2844
2875
|
readonly evalCases?: readonly EvalTest[];
|
|
@@ -3008,6 +3039,8 @@ interface EvalAssertionInput {
|
|
|
3008
3039
|
readonly weight?: number;
|
|
3009
3040
|
/** Whether this assertion is required to pass */
|
|
3010
3041
|
readonly required?: boolean | number;
|
|
3042
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
3043
|
+
readonly min_score?: number;
|
|
3011
3044
|
/** Prompt file for llm_grader */
|
|
3012
3045
|
readonly prompt?: string;
|
|
3013
3046
|
/** Script for code_grader */
|
|
@@ -3042,8 +3075,8 @@ interface EvalConfig {
|
|
|
3042
3075
|
readonly task?: (input: string) => string | Promise<string>;
|
|
3043
3076
|
/** Suite-level assertions applied to all tests */
|
|
3044
3077
|
readonly assert?: readonly AssertEntry[];
|
|
3045
|
-
/** Filter tests by ID pattern (glob supported) */
|
|
3046
|
-
readonly filter?: string;
|
|
3078
|
+
/** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
|
|
3079
|
+
readonly filter?: string | readonly string[];
|
|
3047
3080
|
/** Maximum concurrent workers (default: 3) */
|
|
3048
3081
|
readonly workers?: number;
|
|
3049
3082
|
/** Maximum retries on failure (default: 2) */
|
|
@@ -3056,6 +3089,8 @@ interface EvalConfig {
|
|
|
3056
3089
|
readonly verbose?: boolean;
|
|
3057
3090
|
/** Callback for each completed result */
|
|
3058
3091
|
readonly onResult?: (result: EvaluationResult) => void;
|
|
3092
|
+
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
|
|
3093
|
+
readonly threshold?: number;
|
|
3059
3094
|
}
|
|
3060
3095
|
/**
|
|
3061
3096
|
* Summary statistics for an evaluation run.
|
|
@@ -3063,9 +3098,9 @@ interface EvalConfig {
|
|
|
3063
3098
|
interface EvalSummary {
|
|
3064
3099
|
/** Total number of test cases */
|
|
3065
3100
|
readonly total: number;
|
|
3066
|
-
/** Number of passing test cases (score >=
|
|
3101
|
+
/** Number of passing test cases (score >= threshold) */
|
|
3067
3102
|
readonly passed: number;
|
|
3068
|
-
/** Number of failing test cases (score <
|
|
3103
|
+
/** Number of failing test cases (score < threshold) */
|
|
3069
3104
|
readonly failed: number;
|
|
3070
3105
|
/** Total duration in milliseconds */
|
|
3071
3106
|
readonly durationMs: number;
|
|
@@ -3505,7 +3540,7 @@ declare class WorkspacePoolManager {
|
|
|
3505
3540
|
private removeAllSlots;
|
|
3506
3541
|
/**
|
|
3507
3542
|
* Reset an existing slot for reuse:
|
|
3508
|
-
* 1. Reset repos (git reset --hard
|
|
3543
|
+
* 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
|
|
3509
3544
|
* 2. Re-copy template files (skip repo directories)
|
|
3510
3545
|
*/
|
|
3511
3546
|
private resetSlot;
|
|
@@ -3811,15 +3846,21 @@ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string):
|
|
|
3811
3846
|
/**
|
|
3812
3847
|
* Core types for the transcript import pipeline.
|
|
3813
3848
|
*
|
|
3814
|
-
* A TranscriptEntry
|
|
3815
|
-
*
|
|
3849
|
+
* A TranscriptEntry is the internal (camelCase) representation of a parsed
|
|
3850
|
+
* session. A TranscriptJsonLine is the on-disk (snake_case) wire format
|
|
3851
|
+
* written to .agentv/transcripts/*.jsonl files.
|
|
3852
|
+
*
|
|
3853
|
+
* Flow:
|
|
3854
|
+
* raw session JSONL → parser → TranscriptEntry (internal)
|
|
3855
|
+
* TranscriptEntry → toTranscriptJsonLine() → JSONL on disk
|
|
3856
|
+
* JSONL on disk → readTranscriptJsonl() → TranscriptJsonLine[]
|
|
3816
3857
|
*
|
|
3817
|
-
*
|
|
3818
|
-
*
|
|
3858
|
+
* To add a new importer: write a parser that returns TranscriptEntry,
|
|
3859
|
+
* then use toTranscriptJsonLine() to serialize.
|
|
3819
3860
|
*/
|
|
3820
3861
|
|
|
3821
3862
|
/**
|
|
3822
|
-
* A parsed transcript: ordered messages plus session metadata.
|
|
3863
|
+
* A parsed transcript: ordered messages plus session metadata (internal camelCase).
|
|
3823
3864
|
*/
|
|
3824
3865
|
interface TranscriptEntry {
|
|
3825
3866
|
readonly messages: Message[];
|
|
@@ -3829,7 +3870,7 @@ interface TranscriptEntry {
|
|
|
3829
3870
|
readonly costUsd?: number | null;
|
|
3830
3871
|
}
|
|
3831
3872
|
/**
|
|
3832
|
-
* Metadata describing the origin of a transcript.
|
|
3873
|
+
* Metadata describing the origin of a transcript (internal camelCase).
|
|
3833
3874
|
*/
|
|
3834
3875
|
interface TranscriptSource {
|
|
3835
3876
|
readonly provider: string;
|
|
@@ -3837,7 +3878,45 @@ interface TranscriptSource {
|
|
|
3837
3878
|
readonly projectPath?: string;
|
|
3838
3879
|
readonly startedAt?: string;
|
|
3839
3880
|
readonly model?: string;
|
|
3881
|
+
readonly version?: string;
|
|
3882
|
+
readonly gitBranch?: string;
|
|
3883
|
+
readonly cwd?: string;
|
|
3884
|
+
}
|
|
3885
|
+
/**
|
|
3886
|
+
* One line in a transcript JSONL file (snake_case wire format).
|
|
3887
|
+
*
|
|
3888
|
+
* Each line is a self-contained test case with pre-populated output.
|
|
3889
|
+
* The `input` field is the first user message; the `output` field is the
|
|
3890
|
+
* full conversation (Message[]).
|
|
3891
|
+
*/
|
|
3892
|
+
interface TranscriptJsonLine {
|
|
3893
|
+
readonly input: string;
|
|
3894
|
+
readonly output: readonly Message[];
|
|
3895
|
+
readonly token_usage?: {
|
|
3896
|
+
readonly input: number;
|
|
3897
|
+
readonly output: number;
|
|
3898
|
+
readonly cached?: number;
|
|
3899
|
+
};
|
|
3900
|
+
readonly duration_ms?: number;
|
|
3901
|
+
readonly cost_usd?: number | null;
|
|
3902
|
+
readonly source: {
|
|
3903
|
+
readonly provider: string;
|
|
3904
|
+
readonly session_id: string;
|
|
3905
|
+
readonly model?: string;
|
|
3906
|
+
readonly timestamp?: string;
|
|
3907
|
+
readonly git_branch?: string;
|
|
3908
|
+
readonly cwd?: string;
|
|
3909
|
+
readonly version?: string;
|
|
3910
|
+
};
|
|
3840
3911
|
}
|
|
3912
|
+
/**
|
|
3913
|
+
* Convert a parsed TranscriptEntry to the on-disk JSONL wire format.
|
|
3914
|
+
*/
|
|
3915
|
+
declare function toTranscriptJsonLine(entry: TranscriptEntry): TranscriptJsonLine;
|
|
3916
|
+
/**
|
|
3917
|
+
* Read a transcript JSONL file and parse each line into a TranscriptJsonLine.
|
|
3918
|
+
*/
|
|
3919
|
+
declare function readTranscriptJsonl(filePath: string): Promise<TranscriptJsonLine[]>;
|
|
3841
3920
|
/**
|
|
3842
3921
|
* Read a JSONL transcript file and return its raw text.
|
|
3843
3922
|
* Throws if the file does not exist or cannot be read.
|
|
@@ -3871,6 +3950,70 @@ declare function readTranscriptFile(filePath: string): Promise<string>;
|
|
|
3871
3950
|
|
|
3872
3951
|
declare function parseClaudeSession(jsonl: string): TranscriptEntry;
|
|
3873
3952
|
|
|
3953
|
+
/**
|
|
3954
|
+
* Codex CLI session JSONL parser.
|
|
3955
|
+
*
|
|
3956
|
+
* Reads a Codex CLI rollout transcript
|
|
3957
|
+
* (~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl) and converts it to AgentV's
|
|
3958
|
+
* Message[] format.
|
|
3959
|
+
*
|
|
3960
|
+
* Each line is a JSON object with one of these top-level types:
|
|
3961
|
+
* session_meta → session metadata (id, cwd, cli_version, model)
|
|
3962
|
+
* turn_context → per-turn context (model, cwd, turn_id)
|
|
3963
|
+
* event_msg → events: task_started, task_complete, user_message,
|
|
3964
|
+
* agent_message, token_count
|
|
3965
|
+
* response_item → conversation items: message, function_call,
|
|
3966
|
+
* function_call_output, reasoning, custom_tool_call,
|
|
3967
|
+
* custom_tool_call_output
|
|
3968
|
+
*
|
|
3969
|
+
* Key behaviors:
|
|
3970
|
+
* - response_item with type=message and role=user → user Message
|
|
3971
|
+
* - response_item with type=message and role=assistant → assistant Message
|
|
3972
|
+
* - response_item with type=function_call → ToolCall (pending output)
|
|
3973
|
+
* - response_item with type=function_call_output → matched to pending call by call_id
|
|
3974
|
+
* - response_item with type=reasoning → skipped (thinking tokens)
|
|
3975
|
+
* - response_item with role=developer → skipped (system prompt)
|
|
3976
|
+
* - session_meta → source metadata (session_id, cwd, version, model)
|
|
3977
|
+
* - turn_context → model name extraction
|
|
3978
|
+
* - Duration is from first↔last event timestamp
|
|
3979
|
+
* - cost_usd is null (Codex CLI does not report per-session cost)
|
|
3980
|
+
* - Token usage not available from rollout format (rate limit info only)
|
|
3981
|
+
*
|
|
3982
|
+
* To add a new response_item type: add a case to the switch in parseCodexSession().
|
|
3983
|
+
*/
|
|
3984
|
+
|
|
3985
|
+
declare function parseCodexSession(jsonl: string): TranscriptEntry;
|
|
3986
|
+
|
|
3987
|
+
/**
|
|
3988
|
+
* Codex CLI session discovery.
|
|
3989
|
+
*
|
|
3990
|
+
* Scans ~/.codex/sessions/ for rollout JSONL files. Codex CLI stores sessions at:
|
|
3991
|
+
* ~/.codex/sessions/YYYY/MM/DD/rollout-<timestamp>-<uuid>.jsonl
|
|
3992
|
+
*
|
|
3993
|
+
* Sessions are returned sorted by modification time (most recent first).
|
|
3994
|
+
*/
|
|
3995
|
+
interface CodexSession {
|
|
3996
|
+
/** UUID from the filename */
|
|
3997
|
+
readonly sessionId: string;
|
|
3998
|
+
/** Full path to the JSONL file */
|
|
3999
|
+
readonly filePath: string;
|
|
4000
|
+
/** Filename (e.g., rollout-2026-03-29T14-22-01-<uuid>.jsonl) */
|
|
4001
|
+
readonly filename: string;
|
|
4002
|
+
/** Last modification time */
|
|
4003
|
+
readonly updatedAt: Date;
|
|
4004
|
+
}
|
|
4005
|
+
interface CodexDiscoverOptions {
|
|
4006
|
+
/** Filter by date string (YYYY-MM-DD). */
|
|
4007
|
+
readonly date?: string;
|
|
4008
|
+
/** Maximum number of sessions to return (default: 10). */
|
|
4009
|
+
readonly limit?: number;
|
|
4010
|
+
/** Override the default ~/.codex/sessions directory. */
|
|
4011
|
+
readonly sessionsDir?: string;
|
|
4012
|
+
/** Return only the most recent session. */
|
|
4013
|
+
readonly latest?: boolean;
|
|
4014
|
+
}
|
|
4015
|
+
declare function discoverCodexSessions(opts?: CodexDiscoverOptions): Promise<CodexSession[]>;
|
|
4016
|
+
|
|
3874
4017
|
/**
|
|
3875
4018
|
* Claude Code session discovery.
|
|
3876
4019
|
*
|
|
@@ -3907,9 +4050,80 @@ interface ClaudeDiscoverOptions {
|
|
|
3907
4050
|
}
|
|
3908
4051
|
declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<ClaudeSession[]>;
|
|
3909
4052
|
|
|
4053
|
+
/**
|
|
4054
|
+
* Transcript provider — replays pre-recorded session transcripts through the
|
|
4055
|
+
* evaluation pipeline without invoking any live agent.
|
|
4056
|
+
*
|
|
4057
|
+
* Used by `agentv eval --transcript <file>` to grade imported sessions.
|
|
4058
|
+
*
|
|
4059
|
+
* How it works:
|
|
4060
|
+
* 1. Reads a transcript JSONL file (produced by `agentv import`)
|
|
4061
|
+
* 2. Each invocation pops the next line from the transcript
|
|
4062
|
+
* 3. Returns a ProviderResponse with pre-populated output, token usage, etc.
|
|
4063
|
+
* 4. Evaluators run identically to live eval — they see the same ProviderResponse
|
|
4064
|
+
*
|
|
4065
|
+
* The provider name in results is set to the source provider from the transcript
|
|
4066
|
+
* (e.g., "claude", "codex", "copilot").
|
|
4067
|
+
*/
|
|
4068
|
+
|
|
4069
|
+
declare class TranscriptProvider implements Provider {
|
|
4070
|
+
readonly id: string;
|
|
4071
|
+
readonly kind: "transcript";
|
|
4072
|
+
readonly targetName: string;
|
|
4073
|
+
private lines;
|
|
4074
|
+
private cursor;
|
|
4075
|
+
constructor(targetName: string, lines: TranscriptJsonLine[]);
|
|
4076
|
+
/**
|
|
4077
|
+
* Create a TranscriptProvider from a JSONL file path.
|
|
4078
|
+
*/
|
|
4079
|
+
static fromFile(filePath: string): Promise<TranscriptProvider>;
|
|
4080
|
+
get lineCount(): number;
|
|
4081
|
+
invoke(_request: ProviderRequest): Promise<ProviderResponse>;
|
|
4082
|
+
}
|
|
4083
|
+
|
|
4084
|
+
/**
|
|
4085
|
+
* Copilot CLI events.jsonl parser.
|
|
4086
|
+
*
|
|
4087
|
+
* Reads a Copilot CLI session transcript (events.jsonl) and converts it to
|
|
4088
|
+
* AgentV's Message[] format. Each line is a JSON object with:
|
|
4089
|
+
* { type, data: { ...payload }, id, timestamp, parentId }
|
|
4090
|
+
*
|
|
4091
|
+
* All event-specific fields live under event.data.*, while type, id, timestamp,
|
|
4092
|
+
* and parentId are at the top level.
|
|
4093
|
+
*
|
|
4094
|
+
* Supported event types:
|
|
4095
|
+
* session.start → session metadata (data.sessionId, data.context.cwd)
|
|
4096
|
+
* user.message → Message { role: 'user' }
|
|
4097
|
+
* assistant.message → Message { role: 'assistant', toolCalls from data.toolRequests }
|
|
4098
|
+
* skill.invoked → ToolCall { tool: 'Skill', input: { skill: data.name } }
|
|
4099
|
+
* tool.execution_start + tool.execution_complete → ToolCall with output
|
|
4100
|
+
* session.shutdown → token usage from data.modelMetrics, end timestamp
|
|
4101
|
+
*
|
|
4102
|
+
* To add a new event type:
|
|
4103
|
+
* 1. Add a case to the switch in parseCopilotEvents()
|
|
4104
|
+
* 2. Map it to a Message or ToolCall
|
|
4105
|
+
* 3. Add a test in copilot-log-parser.test.ts
|
|
4106
|
+
*/
|
|
4107
|
+
|
|
4108
|
+
interface CopilotSessionMeta {
|
|
4109
|
+
readonly sessionId: string;
|
|
4110
|
+
readonly model: string;
|
|
4111
|
+
readonly cwd: string;
|
|
4112
|
+
readonly repository?: string;
|
|
4113
|
+
readonly branch?: string;
|
|
4114
|
+
readonly startedAt?: string;
|
|
4115
|
+
}
|
|
4116
|
+
interface ParsedCopilotSession {
|
|
4117
|
+
readonly messages: Message[];
|
|
4118
|
+
readonly meta: CopilotSessionMeta;
|
|
4119
|
+
readonly tokenUsage?: ProviderTokenUsage;
|
|
4120
|
+
readonly durationMs?: number;
|
|
4121
|
+
}
|
|
4122
|
+
declare function parseCopilotEvents(eventsJsonl: string): ParsedCopilotSession;
|
|
4123
|
+
|
|
3910
4124
|
type AgentKernel = {
|
|
3911
4125
|
status: string;
|
|
3912
4126
|
};
|
|
3913
4127
|
declare function createAgentKernel(): AgentKernel;
|
|
3914
4128
|
|
|
3915
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type 
ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type 
SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, 
extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4129
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, 
EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type 
RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, 
discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, 
touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|