@agentv/core 4.6.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -73,7 +73,7 @@ interface ChatMessage {
73
73
  readonly name?: string;
74
74
  }
75
75
  type ChatPrompt = readonly ChatMessage[];
76
- type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
76
+ type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv' | 'transcript';
77
77
  /** Callbacks for real-time observability during provider execution */
78
78
  interface ProviderStreamCallbacks {
79
79
  onToolCallStart?: (toolName: string, toolCallId?: string) => void;
@@ -222,25 +222,19 @@ interface TargetDefinition {
222
222
  readonly judge_target?: string | undefined;
223
223
  readonly workers?: number | undefined;
224
224
  readonly provider_batching?: boolean | undefined;
225
- readonly providerBatching?: boolean | undefined;
225
+ readonly subagent_mode_allowed?: boolean | undefined;
226
226
  readonly endpoint?: string | unknown | undefined;
227
227
  readonly base_url?: string | unknown | undefined;
228
- readonly baseUrl?: string | unknown | undefined;
229
228
  readonly resource?: string | unknown | undefined;
230
- readonly resourceName?: string | unknown | undefined;
231
229
  readonly api_key?: string | unknown | undefined;
232
- readonly apiKey?: string | unknown | undefined;
233
230
  readonly deployment?: string | unknown | undefined;
234
- readonly deploymentName?: string | unknown | undefined;
235
231
  readonly model?: string | unknown | undefined;
236
232
  readonly version?: string | unknown | undefined;
237
233
  readonly api_version?: string | unknown | undefined;
238
234
  readonly variant?: string | unknown | undefined;
239
235
  readonly thinking_budget?: number | unknown | undefined;
240
- readonly thinkingBudget?: number | unknown | undefined;
241
236
  readonly temperature?: number | unknown | undefined;
242
237
  readonly max_output_tokens?: number | unknown | undefined;
243
- readonly maxTokens?: number | unknown | undefined;
244
238
  readonly executable?: string | unknown | undefined;
245
239
  readonly command?: string | unknown | undefined;
246
240
  readonly binary?: string | unknown | undefined;
@@ -248,63 +242,35 @@ interface TargetDefinition {
248
242
  readonly arguments?: unknown | undefined;
249
243
  readonly cwd?: string | unknown | undefined;
250
244
  readonly timeout_seconds?: number | unknown | undefined;
251
- readonly timeoutSeconds?: number | unknown | undefined;
252
245
  readonly log_dir?: string | unknown | undefined;
253
- readonly logDir?: string | unknown | undefined;
254
246
  readonly log_directory?: string | unknown | undefined;
255
- readonly logDirectory?: string | unknown | undefined;
256
247
  readonly log_format?: string | unknown | undefined;
257
- readonly logFormat?: string | unknown | undefined;
258
248
  readonly log_output_format?: string | unknown | undefined;
259
- readonly logOutputFormat?: string | unknown | undefined;
260
249
  readonly system_prompt?: string | unknown | undefined;
261
- readonly systemPrompt?: string | unknown | undefined;
262
250
  readonly max_turns?: number | unknown | undefined;
263
- readonly maxTurns?: number | unknown | undefined;
264
251
  readonly max_budget_usd?: number | unknown | undefined;
265
- readonly maxBudgetUsd?: number | unknown | undefined;
266
252
  readonly response?: string | unknown | undefined;
267
- readonly delayMs?: number | unknown | undefined;
268
- readonly delayMinMs?: number | unknown | undefined;
269
- readonly delayMaxMs?: number | unknown | undefined;
270
253
  readonly wait?: boolean | unknown | undefined;
271
254
  readonly dry_run?: boolean | unknown | undefined;
272
- readonly dryRun?: boolean | unknown | undefined;
273
255
  readonly subagent_root?: string | unknown | undefined;
274
- readonly subagentRoot?: string | unknown | undefined;
275
256
  readonly workspace_template?: string | unknown | undefined;
276
- readonly workspaceTemplate?: string | unknown | undefined;
277
257
  readonly files_format?: string | unknown | undefined;
278
- readonly filesFormat?: string | unknown | undefined;
279
258
  readonly attachments_format?: string | unknown | undefined;
280
- readonly attachmentsFormat?: string | unknown | undefined;
281
259
  readonly env?: unknown | undefined;
282
260
  readonly healthcheck?: unknown | undefined;
283
261
  readonly session_dir?: string | unknown | undefined;
284
- readonly sessionDir?: string | unknown | undefined;
285
262
  readonly session_id?: string | unknown | undefined;
286
- readonly sessionId?: string | unknown | undefined;
287
263
  readonly discover?: string | unknown | undefined;
288
264
  readonly session_state_dir?: string | unknown | undefined;
289
- readonly sessionStateDir?: string | unknown | undefined;
290
265
  readonly cli_url?: string | unknown | undefined;
291
- readonly cliUrl?: string | unknown | undefined;
292
266
  readonly cli_path?: string | unknown | undefined;
293
- readonly cliPath?: string | unknown | undefined;
294
267
  readonly github_token?: string | unknown | undefined;
295
- readonly githubToken?: string | unknown | undefined;
296
268
  readonly max_retries?: number | unknown | undefined;
297
- readonly maxRetries?: number | unknown | undefined;
298
269
  readonly retry_initial_delay_ms?: number | unknown | undefined;
299
- readonly retryInitialDelayMs?: number | unknown | undefined;
300
270
  readonly retry_max_delay_ms?: number | unknown | undefined;
301
- readonly retryMaxDelayMs?: number | unknown | undefined;
302
271
  readonly retry_backoff_factor?: number | unknown | undefined;
303
- readonly retryBackoffFactor?: number | unknown | undefined;
304
272
  readonly retry_status_codes?: unknown | undefined;
305
- readonly retryStatusCodes?: unknown | undefined;
306
273
  readonly fallback_targets?: readonly string[] | unknown | undefined;
307
- readonly fallbackTargets?: readonly string[] | unknown | undefined;
308
274
  }
309
275
 
310
276
  /**
@@ -375,6 +341,8 @@ interface ToolTrajectoryEvaluatorConfig {
375
341
  /** Optional weight for top-level aggregation (defaults to 1.0) */
376
342
  readonly weight?: number;
377
343
  readonly required?: boolean | number;
344
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
345
+ readonly min_score?: number;
378
346
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
379
347
  readonly negate?: boolean;
380
348
  /** Default argument matching mode for all expected items (defaults to 'exact') */
@@ -667,6 +635,8 @@ type CodeEvaluatorConfig = {
667
635
  readonly resolvedCwd?: string;
668
636
  readonly weight?: number;
669
637
  readonly required?: boolean | number;
638
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
639
+ readonly min_score?: number;
670
640
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
671
641
  readonly negate?: boolean;
672
642
  /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
@@ -699,6 +669,8 @@ type LlmGraderEvaluatorConfig = {
699
669
  readonly rubrics?: readonly RubricItem[];
700
670
  readonly weight?: number;
701
671
  readonly required?: boolean | number;
672
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
673
+ readonly min_score?: number;
702
674
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
703
675
  readonly negate?: boolean;
704
676
  /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
@@ -737,13 +709,17 @@ type RubricItem = {
737
709
  readonly outcome?: string;
738
710
  readonly weight: number;
739
711
  /**
740
- * Legacy boolean gating (deprecated, treated as required_min_score: 10).
741
- * Use required_min_score instead for finer control.
712
+ * Legacy boolean gating (treated as min_score: 1.0 for score-range rubrics).
742
713
  */
743
714
  readonly required?: boolean;
744
715
  /**
745
- * Minimum score (0-10) required to pass this criterion.
746
- * If the criterion score is below this threshold, the overall verdict is 'fail'.
716
+ * Minimum score (0-1 scale) required to pass this criterion.
717
+ * Internally compared against normalized score (rawScore / 10).
718
+ */
719
+ readonly min_score?: number;
720
+ /**
721
+ * @deprecated Use min_score (0-1 scale) instead.
722
+ * Legacy: minimum score on 0-10 integer scale.
747
723
  */
748
724
  readonly required_min_score?: number;
749
725
  /**
@@ -776,6 +752,8 @@ type CompositeEvaluatorConfig = {
776
752
  readonly aggregator: CompositeAggregatorConfig;
777
753
  readonly weight?: number;
778
754
  readonly required?: boolean | number;
755
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
756
+ readonly min_score?: number;
779
757
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
780
758
  readonly negate?: boolean;
781
759
  };
@@ -820,6 +798,8 @@ type FieldAccuracyEvaluatorConfig = {
820
798
  readonly aggregation?: FieldAggregationType;
821
799
  readonly weight?: number;
822
800
  readonly required?: boolean | number;
801
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
802
+ readonly min_score?: number;
823
803
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
824
804
  readonly negate?: boolean;
825
805
  };
@@ -834,6 +814,8 @@ type LatencyEvaluatorConfig = {
834
814
  readonly threshold: number;
835
815
  readonly weight?: number;
836
816
  readonly required?: boolean | number;
817
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
818
+ readonly min_score?: number;
837
819
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
838
820
  readonly negate?: boolean;
839
821
  };
@@ -848,6 +830,8 @@ type CostEvaluatorConfig = {
848
830
  readonly budget: number;
849
831
  readonly weight?: number;
850
832
  readonly required?: boolean | number;
833
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
834
+ readonly min_score?: number;
851
835
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
852
836
  readonly negate?: boolean;
853
837
  };
@@ -866,6 +850,8 @@ type TokenUsageEvaluatorConfig = {
866
850
  readonly max_output?: number;
867
851
  readonly weight?: number;
868
852
  readonly required?: boolean | number;
853
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
854
+ readonly min_score?: number;
869
855
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
870
856
  readonly negate?: boolean;
871
857
  };
@@ -893,6 +879,8 @@ type ExecutionMetricsEvaluatorConfig = {
893
879
  readonly exploration_tolerance?: number;
894
880
  readonly weight?: number;
895
881
  readonly required?: boolean | number;
882
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
883
+ readonly min_score?: number;
896
884
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
897
885
  readonly negate?: boolean;
898
886
  };
@@ -906,6 +894,8 @@ type ContainsEvaluatorConfig = {
906
894
  readonly value: string;
907
895
  readonly weight?: number;
908
896
  readonly required?: boolean | number;
897
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
898
+ readonly min_score?: number;
909
899
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
910
900
  readonly negate?: boolean;
911
901
  };
@@ -919,6 +909,8 @@ type ContainsAnyEvaluatorConfig = {
919
909
  readonly value: readonly string[];
920
910
  readonly weight?: number;
921
911
  readonly required?: boolean | number;
912
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
913
+ readonly min_score?: number;
922
914
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
923
915
  readonly negate?: boolean;
924
916
  };
@@ -932,6 +924,8 @@ type ContainsAllEvaluatorConfig = {
932
924
  readonly value: readonly string[];
933
925
  readonly weight?: number;
934
926
  readonly required?: boolean | number;
927
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
928
+ readonly min_score?: number;
935
929
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
936
930
  readonly negate?: boolean;
937
931
  };
@@ -945,6 +939,8 @@ type IcontainsEvaluatorConfig = {
945
939
  readonly value: string;
946
940
  readonly weight?: number;
947
941
  readonly required?: boolean | number;
942
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
943
+ readonly min_score?: number;
948
944
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
949
945
  readonly negate?: boolean;
950
946
  };
@@ -958,6 +954,8 @@ type IcontainsAnyEvaluatorConfig = {
958
954
  readonly value: readonly string[];
959
955
  readonly weight?: number;
960
956
  readonly required?: boolean | number;
957
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
958
+ readonly min_score?: number;
961
959
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
962
960
  readonly negate?: boolean;
963
961
  };
@@ -971,6 +969,8 @@ type IcontainsAllEvaluatorConfig = {
971
969
  readonly value: readonly string[];
972
970
  readonly weight?: number;
973
971
  readonly required?: boolean | number;
972
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
973
+ readonly min_score?: number;
974
974
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
975
975
  readonly negate?: boolean;
976
976
  };
@@ -984,6 +984,8 @@ type StartsWithEvaluatorConfig = {
984
984
  readonly value: string;
985
985
  readonly weight?: number;
986
986
  readonly required?: boolean | number;
987
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
988
+ readonly min_score?: number;
987
989
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
988
990
  readonly negate?: boolean;
989
991
  };
@@ -997,6 +999,8 @@ type EndsWithEvaluatorConfig = {
997
999
  readonly value: string;
998
1000
  readonly weight?: number;
999
1001
  readonly required?: boolean | number;
1002
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1003
+ readonly min_score?: number;
1000
1004
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1001
1005
  readonly negate?: boolean;
1002
1006
  };
@@ -1012,6 +1016,8 @@ type RegexEvaluatorConfig = {
1012
1016
  readonly flags?: string;
1013
1017
  readonly weight?: number;
1014
1018
  readonly required?: boolean | number;
1019
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1020
+ readonly min_score?: number;
1015
1021
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1016
1022
  readonly negate?: boolean;
1017
1023
  };
@@ -1024,6 +1030,8 @@ type IsJsonEvaluatorConfig = {
1024
1030
  readonly type: 'is-json';
1025
1031
  readonly weight?: number;
1026
1032
  readonly required?: boolean | number;
1033
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1034
+ readonly min_score?: number;
1027
1035
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1028
1036
  readonly negate?: boolean;
1029
1037
  };
@@ -1037,6 +1045,8 @@ type EqualsEvaluatorConfig = {
1037
1045
  readonly value: string;
1038
1046
  readonly weight?: number;
1039
1047
  readonly required?: boolean | number;
1048
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1049
+ readonly min_score?: number;
1040
1050
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1041
1051
  readonly negate?: boolean;
1042
1052
  };
@@ -1050,6 +1060,8 @@ type RubricsEvaluatorConfig = {
1050
1060
  readonly criteria: readonly RubricItem[];
1051
1061
  readonly weight?: number;
1052
1062
  readonly required?: boolean | number;
1063
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1064
+ readonly min_score?: number;
1053
1065
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1054
1066
  readonly negate?: boolean;
1055
1067
  };
@@ -1068,6 +1080,8 @@ type SkillTriggerEvaluatorConfig = {
1068
1080
  readonly should_trigger?: boolean;
1069
1081
  readonly weight?: number;
1070
1082
  readonly required?: boolean | number;
1083
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1084
+ readonly min_score?: number;
1071
1085
  readonly negate?: boolean;
1072
1086
  };
1073
1087
  /**
@@ -1079,6 +1093,8 @@ type InlineAssertEvaluatorConfig = {
1079
1093
  readonly type: 'inline-assert';
1080
1094
  readonly weight?: number;
1081
1095
  readonly required?: boolean | number;
1096
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1097
+ readonly min_score?: number;
1082
1098
  readonly negate?: boolean;
1083
1099
  };
1084
1100
  type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
@@ -1087,7 +1103,7 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
1087
1103
  */
1088
1104
  interface EvalTest {
1089
1105
  readonly id: string;
1090
- readonly dataset?: string;
1106
+ readonly suite?: string;
1091
1107
  readonly category?: string;
1092
1108
  readonly conversation_id?: string;
1093
1109
  readonly question: string;
@@ -1104,6 +1120,8 @@ interface EvalTest {
1104
1120
  readonly metadata?: Record<string, unknown>;
1105
1121
  /** Per-test target override (matrix evaluation) */
1106
1122
  readonly targets?: readonly string[];
1123
+ /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
1124
+ readonly threshold?: number;
1107
1125
  }
1108
1126
  /** @deprecated Use `EvalTest` instead */
1109
1127
  type EvalCase = EvalTest;
@@ -1197,7 +1215,7 @@ type FailOnError = boolean;
1197
1215
  interface EvaluationResult {
1198
1216
  readonly timestamp: string;
1199
1217
  readonly testId: string;
1200
- readonly dataset?: string;
1218
+ readonly suite?: string;
1201
1219
  readonly category?: string;
1202
1220
  readonly conversationId?: string;
1203
1221
  readonly score: number;
@@ -1427,8 +1445,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skill
1427
1445
 
1428
1446
  type LoadOptions = {
1429
1447
  readonly verbose?: boolean;
1430
- /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
1431
- readonly filter?: string;
1448
+ /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
1449
+ readonly filter?: string | readonly string[];
1432
1450
  /** Category derived from the eval file's directory path */
1433
1451
  readonly category?: string;
1434
1452
  };
@@ -1599,7 +1617,7 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
1599
1617
  /**
1600
1618
  * Strict normalized schema for CLI target configuration.
1601
1619
  * This is the final validated shape after environment variable resolution
1602
- * and snake_case to camelCase normalization.
1620
+ * and internal field normalization.
1603
1621
  *
1604
1622
  * Uses .strict() to reject unknown properties, ensuring configuration
1605
1623
  * errors are caught early rather than silently ignored.
@@ -1648,8 +1666,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
1648
1666
  command: string;
1649
1667
  verbose?: boolean | undefined;
1650
1668
  cwd?: string | undefined;
1651
- filesFormat?: string | undefined;
1652
- workspaceTemplate?: string | undefined;
1653
1669
  healthcheck?: {
1654
1670
  url: string;
1655
1671
  timeoutMs?: number | undefined;
@@ -1658,14 +1674,14 @@ declare const CliTargetConfigSchema: z.ZodObject<{
1658
1674
  cwd?: string | undefined;
1659
1675
  timeoutMs?: number | undefined;
1660
1676
  } | undefined;
1661
- keepTempFiles?: boolean | undefined;
1662
1677
  timeoutMs?: number | undefined;
1678
+ filesFormat?: string | undefined;
1679
+ workspaceTemplate?: string | undefined;
1680
+ keepTempFiles?: boolean | undefined;
1663
1681
  }, {
1664
1682
  command: string;
1665
1683
  verbose?: boolean | undefined;
1666
1684
  cwd?: string | undefined;
1667
- filesFormat?: string | undefined;
1668
- workspaceTemplate?: string | undefined;
1669
1685
  healthcheck?: {
1670
1686
  url: string;
1671
1687
  timeoutMs?: number | undefined;
@@ -1674,8 +1690,10 @@ declare const CliTargetConfigSchema: z.ZodObject<{
1674
1690
  cwd?: string | undefined;
1675
1691
  timeoutMs?: number | undefined;
1676
1692
  } | undefined;
1677
- keepTempFiles?: boolean | undefined;
1678
1693
  timeoutMs?: number | undefined;
1694
+ filesFormat?: string | undefined;
1695
+ workspaceTemplate?: string | undefined;
1696
+ keepTempFiles?: boolean | undefined;
1679
1697
  }>;
1680
1698
  type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
1681
1699
  /**
@@ -1707,6 +1725,7 @@ interface AzureResolvedConfig {
1707
1725
  readonly deploymentName: string;
1708
1726
  readonly apiKey: string;
1709
1727
  readonly version?: string;
1728
+ readonly apiFormat?: ApiFormat;
1710
1729
  readonly temperature?: number;
1711
1730
  readonly maxOutputTokens?: number;
1712
1731
  readonly retry?: RetryConfig;
@@ -1931,15 +1950,20 @@ type ResolvedTarget = (ResolvedTargetBase & {
1931
1950
  }) | (ResolvedTargetBase & {
1932
1951
  readonly kind: 'cli';
1933
1952
  readonly config: CliResolvedConfig;
1953
+ }) | (ResolvedTargetBase & {
1954
+ readonly kind: 'transcript';
1955
+ readonly config: Record<string, never>;
1934
1956
  });
1935
1957
  /**
1936
1958
  * Optional settings accepted on ALL target definitions regardless of provider.
1937
1959
  * Exported so the targets validator can reuse the same list — adding a field
1938
1960
  * here automatically makes it valid in targets.yaml without a separate update.
1939
1961
  */
1940
- declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed", "fallback_targets", "fallbackTargets"];
1962
+ declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
1941
1963
  declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
1942
- declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
1964
+ declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
1965
+ readonly emitDeprecationWarnings?: boolean;
1966
+ }): ResolvedTarget;
1943
1967
 
1944
1968
  /**
1945
1969
  * Extensible provider registry.
@@ -2204,19 +2228,25 @@ interface EvaluatorFactory {
2204
2228
  *
2205
2229
  * Scoring model:
2206
2230
  * score ∈ [0, 1] — continuous quality signal
2207
- * verdict — binary classification derived from score via PASS_THRESHOLD
2231
+ * verdict — binary classification derived from score via threshold
2208
2232
  *
2209
- * score >= PASS_THRESHOLD → 'pass'
2210
- * score < PASS_THRESHOLD → 'fail'
2233
+ * score >= threshold → 'pass'
2234
+ * score < threshold → 'fail'
2211
2235
  * (infrastructure skip) → 'skip'
2212
2236
  *
2213
- * To change the pass/fail boundary, update PASS_THRESHOLD.
2214
- * All verdict derivation flows through scoreToVerdict().
2237
+ * Scoring scale principle:
2238
+ * All user-configurable score thresholds use 0-1 scale.
2239
+ * The only 0-10 values in YAML are `score_ranges` which define LLM integer output band labels.
2240
+ *
2241
+ * Default threshold is 0.8. Override via CLI `--threshold`, suite `execution.threshold`,
2242
+ * or per-test `execution.threshold`. All verdict derivation flows through scoreToVerdict().
2215
2243
  */
2216
2244
 
2217
- /** Score threshold for pass verdict. Scores below this are fail. */
2245
+ /** Default score threshold for pass verdict (0-1). Scores below this are fail. */
2246
+ declare const DEFAULT_THRESHOLD = 0.8;
2247
+ /** @deprecated Use DEFAULT_THRESHOLD instead. */
2218
2248
  declare const PASS_THRESHOLD = 0.8;
2219
- declare function scoreToVerdict(score: number): EvaluationVerdict;
2249
+ declare function scoreToVerdict(score: number, threshold?: number): EvaluationVerdict;
2220
2250
  declare function clampScore(value: number): number;
2221
2251
  declare function extractJsonBlob(text: string): string | undefined;
2222
2252
  declare function parseJsonFromText(text: string): unknown;
@@ -2499,6 +2529,7 @@ declare class LlmGraderEvaluator implements Evaluator {
2499
2529
  private buildScoreRangePrompt;
2500
2530
  private buildRubricPrompt;
2501
2531
  private runWithRetry;
2532
+ private generateStructuredResponse;
2502
2533
  }
2503
2534
  /**
2504
2535
  * Build the mandatory output schema that all evaluators must follow.
@@ -2837,8 +2868,8 @@ interface RunEvaluationOptions {
2837
2868
  readonly cache?: EvaluationCache;
2838
2869
  readonly useCache?: boolean;
2839
2870
  readonly now?: () => Date;
2840
- /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
2841
- readonly filter?: string;
2871
+ /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
2872
+ readonly filter?: string | readonly string[];
2842
2873
  readonly verbose?: boolean;
2843
2874
  readonly maxConcurrency?: number;
2844
2875
  readonly evalCases?: readonly EvalTest[];
@@ -3008,6 +3039,8 @@ interface EvalAssertionInput {
3008
3039
  readonly weight?: number;
3009
3040
  /** Whether this assertion is required to pass */
3010
3041
  readonly required?: boolean | number;
3042
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
3043
+ readonly min_score?: number;
3011
3044
  /** Prompt file for llm_grader */
3012
3045
  readonly prompt?: string;
3013
3046
  /** Script for code_grader */
@@ -3042,8 +3075,8 @@ interface EvalConfig {
3042
3075
  readonly task?: (input: string) => string | Promise<string>;
3043
3076
  /** Suite-level assertions applied to all tests */
3044
3077
  readonly assert?: readonly AssertEntry[];
3045
- /** Filter tests by ID pattern (glob supported) */
3046
- readonly filter?: string;
3078
+ /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
3079
+ readonly filter?: string | readonly string[];
3047
3080
  /** Maximum concurrent workers (default: 3) */
3048
3081
  readonly workers?: number;
3049
3082
  /** Maximum retries on failure (default: 2) */
@@ -3056,6 +3089,8 @@ interface EvalConfig {
3056
3089
  readonly verbose?: boolean;
3057
3090
  /** Callback for each completed result */
3058
3091
  readonly onResult?: (result: EvaluationResult) => void;
3092
+ /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
3093
+ readonly threshold?: number;
3059
3094
  }
3060
3095
  /**
3061
3096
  * Summary statistics for an evaluation run.
@@ -3063,9 +3098,9 @@ interface EvalConfig {
3063
3098
  interface EvalSummary {
3064
3099
  /** Total number of test cases */
3065
3100
  readonly total: number;
3066
- /** Number of passing test cases (score >= PASS_THRESHOLD) */
3101
+ /** Number of passing test cases (score >= threshold) */
3067
3102
  readonly passed: number;
3068
- /** Number of failing test cases (score < PASS_THRESHOLD) */
3103
+ /** Number of failing test cases (score < threshold) */
3069
3104
  readonly failed: number;
3070
3105
  /** Total duration in milliseconds */
3071
3106
  readonly durationMs: number;
@@ -3505,7 +3540,7 @@ declare class WorkspacePoolManager {
3505
3540
  private removeAllSlots;
3506
3541
  /**
3507
3542
  * Reset an existing slot for reuse:
3508
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
3543
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
3509
3544
  * 2. Re-copy template files (skip repo directories)
3510
3545
  */
3511
3546
  private resetSlot;
@@ -3811,15 +3846,21 @@ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string):
3811
3846
  /**
3812
3847
  * Core types for the transcript import pipeline.
3813
3848
  *
3814
- * A TranscriptEntry represents a single event in a parsed agent session
3815
- * transcript (user message, assistant response, tool call, etc.).
3849
+ * A TranscriptEntry is the internal (camelCase) representation of a parsed
3850
+ * session. A TranscriptJsonLine is the on-disk (snake_case) wire format
3851
+ * written to .agentv/transcripts/*.jsonl files.
3852
+ *
3853
+ * Flow:
3854
+ * raw session JSONL → parser → TranscriptEntry (internal)
3855
+ * TranscriptEntry → toTranscriptJsonLine() → JSONL on disk
3856
+ * JSONL on disk → readTranscriptJsonl() → TranscriptJsonLine[]
3816
3857
  *
3817
- * A TranscriptSource describes where a transcript came from (provider,
3818
- * session ID, file path, etc.).
3858
+ * To add a new importer: write a parser that returns TranscriptEntry,
3859
+ * then use toTranscriptJsonLine() to serialize.
3819
3860
  */
3820
3861
 
3821
3862
  /**
3822
- * A parsed transcript: ordered messages plus session metadata.
3863
+ * A parsed transcript: ordered messages plus session metadata (internal camelCase).
3823
3864
  */
3824
3865
  interface TranscriptEntry {
3825
3866
  readonly messages: Message[];
@@ -3829,7 +3870,7 @@ interface TranscriptEntry {
3829
3870
  readonly costUsd?: number | null;
3830
3871
  }
3831
3872
  /**
3832
- * Metadata describing the origin of a transcript.
3873
+ * Metadata describing the origin of a transcript (internal camelCase).
3833
3874
  */
3834
3875
  interface TranscriptSource {
3835
3876
  readonly provider: string;
@@ -3837,7 +3878,45 @@ interface TranscriptSource {
3837
3878
  readonly projectPath?: string;
3838
3879
  readonly startedAt?: string;
3839
3880
  readonly model?: string;
3881
+ readonly version?: string;
3882
+ readonly gitBranch?: string;
3883
+ readonly cwd?: string;
3884
+ }
3885
+ /**
3886
+ * One line in a transcript JSONL file (snake_case wire format).
3887
+ *
3888
+ * Each line is a self-contained test case with pre-populated output.
3889
+ * The `input` field is the first user message; the `output` field is the
3890
+ * full conversation (Message[]).
3891
+ */
3892
+ interface TranscriptJsonLine {
3893
+ readonly input: string;
3894
+ readonly output: readonly Message[];
3895
+ readonly token_usage?: {
3896
+ readonly input: number;
3897
+ readonly output: number;
3898
+ readonly cached?: number;
3899
+ };
3900
+ readonly duration_ms?: number;
3901
+ readonly cost_usd?: number | null;
3902
+ readonly source: {
3903
+ readonly provider: string;
3904
+ readonly session_id: string;
3905
+ readonly model?: string;
3906
+ readonly timestamp?: string;
3907
+ readonly git_branch?: string;
3908
+ readonly cwd?: string;
3909
+ readonly version?: string;
3910
+ };
3840
3911
  }
3912
+ /**
3913
+ * Convert a parsed TranscriptEntry to the on-disk JSONL wire format.
3914
+ */
3915
+ declare function toTranscriptJsonLine(entry: TranscriptEntry): TranscriptJsonLine;
3916
+ /**
3917
+ * Read a transcript JSONL file and parse each line into a TranscriptJsonLine.
3918
+ */
3919
+ declare function readTranscriptJsonl(filePath: string): Promise<TranscriptJsonLine[]>;
3841
3920
  /**
3842
3921
  * Read a JSONL transcript file and return its raw text.
3843
3922
  * Throws if the file does not exist or cannot be read.
@@ -3871,6 +3950,70 @@ declare function readTranscriptFile(filePath: string): Promise<string>;
3871
3950
 
3872
3951
  declare function parseClaudeSession(jsonl: string): TranscriptEntry;
3873
3952
 
3953
+ /**
3954
+ * Codex CLI session JSONL parser.
3955
+ *
3956
+ * Reads a Codex CLI rollout transcript
3957
+ * (~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl) and converts it to AgentV's
3958
+ * Message[] format.
3959
+ *
3960
+ * Each line is a JSON object with one of these top-level types:
3961
+ * session_meta → session metadata (id, cwd, cli_version, model)
3962
+ * turn_context → per-turn context (model, cwd, turn_id)
3963
+ * event_msg → events: task_started, task_complete, user_message,
3964
+ * agent_message, token_count
3965
+ * response_item → conversation items: message, function_call,
3966
+ * function_call_output, reasoning, custom_tool_call,
3967
+ * custom_tool_call_output
3968
+ *
3969
+ * Key behaviors:
3970
+ * - response_item with type=message and role=user → user Message
3971
+ * - response_item with type=message and role=assistant → assistant Message
3972
+ * - response_item with type=function_call → ToolCall (pending output)
3973
+ * - response_item with type=function_call_output → matched to pending call by call_id
3974
+ * - response_item with type=reasoning → skipped (thinking tokens)
3975
+ * - response_item with role=developer → skipped (system prompt)
3976
+ * - session_meta → source metadata (session_id, cwd, version, model)
3977
+ * - turn_context → model name extraction
3978
+ * - Duration is measured from the first to the last event timestamp
3979
+ * - costUsd is null (Codex CLI does not report per-session cost)
3980
+ * - Token usage not available from rollout format (rate limit info only)
3981
+ *
3982
+ * To add a new response_item type: add a case to the switch in parseCodexSession().
3983
+ */
3984
+
3985
+ declare function parseCodexSession(jsonl: string): TranscriptEntry;
3986
+
3987
+ /**
3988
+ * Codex CLI session discovery.
3989
+ *
3990
+ * Scans ~/.codex/sessions/ for rollout JSONL files. Codex CLI stores sessions at:
3991
+ * ~/.codex/sessions/YYYY/MM/DD/rollout-<timestamp>-<uuid>.jsonl
3992
+ *
3993
+ * Sessions are returned sorted by modification time (most recent first).
3994
+ */
3995
+ interface CodexSession {
3996
+ /** UUID from the filename */
3997
+ readonly sessionId: string;
3998
+ /** Full path to the JSONL file */
3999
+ readonly filePath: string;
4000
+ /** Filename (e.g., rollout-2026-03-29T14-22-01-<uuid>.jsonl) */
4001
+ readonly filename: string;
4002
+ /** Last modification time */
4003
+ readonly updatedAt: Date;
4004
+ }
4005
+ interface CodexDiscoverOptions {
4006
+ /** Filter by date string (YYYY-MM-DD). */
4007
+ readonly date?: string;
4008
+ /** Maximum number of sessions to return (default: 10). */
4009
+ readonly limit?: number;
4010
+ /** Override the default ~/.codex/sessions directory. */
4011
+ readonly sessionsDir?: string;
4012
+ /** Return only the most recent session. */
4013
+ readonly latest?: boolean;
4014
+ }
4015
+ declare function discoverCodexSessions(opts?: CodexDiscoverOptions): Promise<CodexSession[]>;
4016
+
3874
4017
  /**
3875
4018
  * Claude Code session discovery.
3876
4019
  *
@@ -3907,9 +4050,80 @@ interface ClaudeDiscoverOptions {
3907
4050
  }
3908
4051
  declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<ClaudeSession[]>;
3909
4052
 
4053
+ /**
4054
+ * Transcript provider — replays pre-recorded session transcripts through the
4055
+ * evaluation pipeline without invoking any live agent.
4056
+ *
4057
+ * Used by `agentv eval --transcript <file>` to grade imported sessions.
4058
+ *
4059
+ * How it works:
4060
+ * 1. Reads a transcript JSONL file (produced by `agentv import`)
4061
+ * 2. Each invocation returns the next line from the transcript, advancing an internal cursor
4062
+ * 3. Returns a ProviderResponse with pre-populated output, token usage, etc.
4063
+ * 4. Evaluators run identically to live eval — they see the same ProviderResponse
4064
+ *
4065
+ * The provider name in results is set to the source provider from the transcript
4066
+ * (e.g., "claude", "codex", "copilot").
4067
+ */
4068
+
4069
+ declare class TranscriptProvider implements Provider {
4070
+ readonly id: string;
4071
+ readonly kind: "transcript";
4072
+ readonly targetName: string;
4073
+ private lines;
4074
+ private cursor;
4075
+ constructor(targetName: string, lines: TranscriptJsonLine[]);
4076
+ /**
4077
+ * Create a TranscriptProvider from a JSONL file path.
4078
+ */
4079
+ static fromFile(filePath: string): Promise<TranscriptProvider>;
4080
+ get lineCount(): number;
4081
+ invoke(_request: ProviderRequest): Promise<ProviderResponse>;
4082
+ }
4083
+
4084
+ /**
4085
+ * Copilot CLI events.jsonl parser.
4086
+ *
4087
+ * Reads a Copilot CLI session transcript (events.jsonl) and converts it to
4088
+ * AgentV's Message[] format. Each line is a JSON object with:
4089
+ * { type, data: { ...payload }, id, timestamp, parentId }
4090
+ *
4091
+ * All event-specific fields live under event.data.*, while type, id, timestamp,
4092
+ * and parentId are at the top level.
4093
+ *
4094
+ * Supported event types:
4095
+ * session.start → session metadata (data.sessionId, data.context.cwd)
4096
+ * user.message → Message { role: 'user' }
4097
+ * assistant.message → Message { role: 'assistant', toolCalls from data.toolRequests }
4098
+ * skill.invoked → ToolCall { tool: 'Skill', input: { skill: data.name } }
4099
+ * tool.execution_start + tool.execution_complete → ToolCall with output
4100
+ * session.shutdown → token usage from data.modelMetrics, end timestamp
4101
+ *
4102
+ * To add a new event type:
4103
+ * 1. Add a case to the switch in parseCopilotEvents()
4104
+ * 2. Map it to a Message or ToolCall
4105
+ * 3. Add a test in copilot-log-parser.test.ts
4106
+ */
4107
+
4108
+ interface CopilotSessionMeta {
4109
+ readonly sessionId: string;
4110
+ readonly model: string;
4111
+ readonly cwd: string;
4112
+ readonly repository?: string;
4113
+ readonly branch?: string;
4114
+ readonly startedAt?: string;
4115
+ }
4116
+ interface ParsedCopilotSession {
4117
+ readonly messages: Message[];
4118
+ readonly meta: CopilotSessionMeta;
4119
+ readonly tokenUsage?: ProviderTokenUsage;
4120
+ readonly durationMs?: number;
4121
+ }
4122
+ declare function parseCopilotEvents(eventsJsonl: string): ParsedCopilotSession;
4123
+
3910
4124
  type AgentKernel = {
3911
4125
  status: string;
3912
4126
  };
3913
4127
  declare function createAgentKernel(): AgentKernel;
3914
4128
 
3915
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type 
ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type 
SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, 
extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4129
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, 
EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type 
RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, 
discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, 
touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };