@agentv/core 4.6.1 → 4.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -73,7 +73,7 @@ interface ChatMessage {
73
73
  readonly name?: string;
74
74
  }
75
75
  type ChatPrompt = readonly ChatMessage[];
76
- type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
76
+ type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv' | 'transcript';
77
77
  /** Callbacks for real-time observability during provider execution */
78
78
  interface ProviderStreamCallbacks {
79
79
  onToolCallStart?: (toolName: string, toolCallId?: string) => void;
@@ -222,25 +222,19 @@ interface TargetDefinition {
222
222
  readonly judge_target?: string | undefined;
223
223
  readonly workers?: number | undefined;
224
224
  readonly provider_batching?: boolean | undefined;
225
- readonly providerBatching?: boolean | undefined;
225
+ readonly subagent_mode_allowed?: boolean | undefined;
226
226
  readonly endpoint?: string | unknown | undefined;
227
227
  readonly base_url?: string | unknown | undefined;
228
- readonly baseUrl?: string | unknown | undefined;
229
228
  readonly resource?: string | unknown | undefined;
230
- readonly resourceName?: string | unknown | undefined;
231
229
  readonly api_key?: string | unknown | undefined;
232
- readonly apiKey?: string | unknown | undefined;
233
230
  readonly deployment?: string | unknown | undefined;
234
- readonly deploymentName?: string | unknown | undefined;
235
231
  readonly model?: string | unknown | undefined;
236
232
  readonly version?: string | unknown | undefined;
237
233
  readonly api_version?: string | unknown | undefined;
238
234
  readonly variant?: string | unknown | undefined;
239
235
  readonly thinking_budget?: number | unknown | undefined;
240
- readonly thinkingBudget?: number | unknown | undefined;
241
236
  readonly temperature?: number | unknown | undefined;
242
237
  readonly max_output_tokens?: number | unknown | undefined;
243
- readonly maxTokens?: number | unknown | undefined;
244
238
  readonly executable?: string | unknown | undefined;
245
239
  readonly command?: string | unknown | undefined;
246
240
  readonly binary?: string | unknown | undefined;
@@ -248,63 +242,36 @@ interface TargetDefinition {
248
242
  readonly arguments?: unknown | undefined;
249
243
  readonly cwd?: string | unknown | undefined;
250
244
  readonly timeout_seconds?: number | unknown | undefined;
251
- readonly timeoutSeconds?: number | unknown | undefined;
252
245
  readonly log_dir?: string | unknown | undefined;
253
- readonly logDir?: string | unknown | undefined;
254
246
  readonly log_directory?: string | unknown | undefined;
255
- readonly logDirectory?: string | unknown | undefined;
256
247
  readonly log_format?: string | unknown | undefined;
257
- readonly logFormat?: string | unknown | undefined;
258
248
  readonly log_output_format?: string | unknown | undefined;
259
- readonly logOutputFormat?: string | unknown | undefined;
260
249
  readonly system_prompt?: string | unknown | undefined;
261
- readonly systemPrompt?: string | unknown | undefined;
262
250
  readonly max_turns?: number | unknown | undefined;
263
- readonly maxTurns?: number | unknown | undefined;
264
251
  readonly max_budget_usd?: number | unknown | undefined;
265
- readonly maxBudgetUsd?: number | unknown | undefined;
266
252
  readonly response?: string | unknown | undefined;
267
- readonly delayMs?: number | unknown | undefined;
268
- readonly delayMinMs?: number | unknown | undefined;
269
- readonly delayMaxMs?: number | unknown | undefined;
270
253
  readonly wait?: boolean | unknown | undefined;
271
254
  readonly dry_run?: boolean | unknown | undefined;
272
- readonly dryRun?: boolean | unknown | undefined;
273
255
  readonly subagent_root?: string | unknown | undefined;
274
- readonly subagentRoot?: string | unknown | undefined;
275
256
  readonly workspace_template?: string | unknown | undefined;
276
- readonly workspaceTemplate?: string | unknown | undefined;
277
257
  readonly files_format?: string | unknown | undefined;
278
- readonly filesFormat?: string | unknown | undefined;
279
258
  readonly attachments_format?: string | unknown | undefined;
280
- readonly attachmentsFormat?: string | unknown | undefined;
281
259
  readonly env?: unknown | undefined;
282
260
  readonly healthcheck?: unknown | undefined;
283
261
  readonly session_dir?: string | unknown | undefined;
284
- readonly sessionDir?: string | unknown | undefined;
285
262
  readonly session_id?: string | unknown | undefined;
286
- readonly sessionId?: string | unknown | undefined;
287
263
  readonly discover?: string | unknown | undefined;
288
264
  readonly session_state_dir?: string | unknown | undefined;
289
- readonly sessionStateDir?: string | unknown | undefined;
290
265
  readonly cli_url?: string | unknown | undefined;
291
- readonly cliUrl?: string | unknown | undefined;
292
266
  readonly cli_path?: string | unknown | undefined;
293
- readonly cliPath?: string | unknown | undefined;
294
267
  readonly github_token?: string | unknown | undefined;
295
- readonly githubToken?: string | unknown | undefined;
268
+ readonly byok?: Record<string, unknown> | undefined;
296
269
  readonly max_retries?: number | unknown | undefined;
297
- readonly maxRetries?: number | unknown | undefined;
298
270
  readonly retry_initial_delay_ms?: number | unknown | undefined;
299
- readonly retryInitialDelayMs?: number | unknown | undefined;
300
271
  readonly retry_max_delay_ms?: number | unknown | undefined;
301
- readonly retryMaxDelayMs?: number | unknown | undefined;
302
272
  readonly retry_backoff_factor?: number | unknown | undefined;
303
- readonly retryBackoffFactor?: number | unknown | undefined;
304
273
  readonly retry_status_codes?: unknown | undefined;
305
- readonly retryStatusCodes?: unknown | undefined;
306
274
  readonly fallback_targets?: readonly string[] | unknown | undefined;
307
- readonly fallbackTargets?: readonly string[] | unknown | undefined;
308
275
  }
309
276
 
310
277
  /**
@@ -375,6 +342,8 @@ interface ToolTrajectoryEvaluatorConfig {
375
342
  /** Optional weight for top-level aggregation (defaults to 1.0) */
376
343
  readonly weight?: number;
377
344
  readonly required?: boolean | number;
345
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
346
+ readonly min_score?: number;
378
347
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
379
348
  readonly negate?: boolean;
380
349
  /** Default argument matching mode for all expected items (defaults to 'exact') */
@@ -667,6 +636,8 @@ type CodeEvaluatorConfig = {
667
636
  readonly resolvedCwd?: string;
668
637
  readonly weight?: number;
669
638
  readonly required?: boolean | number;
639
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
640
+ readonly min_score?: number;
670
641
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
671
642
  readonly negate?: boolean;
672
643
  /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
@@ -699,6 +670,8 @@ type LlmGraderEvaluatorConfig = {
699
670
  readonly rubrics?: readonly RubricItem[];
700
671
  readonly weight?: number;
701
672
  readonly required?: boolean | number;
673
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
674
+ readonly min_score?: number;
702
675
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
703
676
  readonly negate?: boolean;
704
677
  /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
@@ -737,13 +710,17 @@ type RubricItem = {
737
710
  readonly outcome?: string;
738
711
  readonly weight: number;
739
712
  /**
740
- * Legacy boolean gating (deprecated, treated as required_min_score: 10).
741
- * Use required_min_score instead for finer control.
713
+ * Legacy boolean gating (treated as min_score: 1.0 for score-range rubrics).
742
714
  */
743
715
  readonly required?: boolean;
744
716
  /**
745
- * Minimum score (0-10) required to pass this criterion.
746
- * If the criterion score is below this threshold, the overall verdict is 'fail'.
717
+ * Minimum score (0-1 scale) required to pass this criterion.
718
+ * Internally compared against normalized score (rawScore / 10).
719
+ */
720
+ readonly min_score?: number;
721
+ /**
722
+ * @deprecated Use min_score (0-1 scale) instead.
723
+ * Legacy: minimum score on 0-10 integer scale.
747
724
  */
748
725
  readonly required_min_score?: number;
749
726
  /**
@@ -776,6 +753,8 @@ type CompositeEvaluatorConfig = {
776
753
  readonly aggregator: CompositeAggregatorConfig;
777
754
  readonly weight?: number;
778
755
  readonly required?: boolean | number;
756
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
757
+ readonly min_score?: number;
779
758
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
780
759
  readonly negate?: boolean;
781
760
  };
@@ -820,6 +799,8 @@ type FieldAccuracyEvaluatorConfig = {
820
799
  readonly aggregation?: FieldAggregationType;
821
800
  readonly weight?: number;
822
801
  readonly required?: boolean | number;
802
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
803
+ readonly min_score?: number;
823
804
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
824
805
  readonly negate?: boolean;
825
806
  };
@@ -834,6 +815,8 @@ type LatencyEvaluatorConfig = {
834
815
  readonly threshold: number;
835
816
  readonly weight?: number;
836
817
  readonly required?: boolean | number;
818
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
819
+ readonly min_score?: number;
837
820
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
838
821
  readonly negate?: boolean;
839
822
  };
@@ -848,6 +831,8 @@ type CostEvaluatorConfig = {
848
831
  readonly budget: number;
849
832
  readonly weight?: number;
850
833
  readonly required?: boolean | number;
834
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
835
+ readonly min_score?: number;
851
836
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
852
837
  readonly negate?: boolean;
853
838
  };
@@ -866,6 +851,8 @@ type TokenUsageEvaluatorConfig = {
866
851
  readonly max_output?: number;
867
852
  readonly weight?: number;
868
853
  readonly required?: boolean | number;
854
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
855
+ readonly min_score?: number;
869
856
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
870
857
  readonly negate?: boolean;
871
858
  };
@@ -893,6 +880,8 @@ type ExecutionMetricsEvaluatorConfig = {
893
880
  readonly exploration_tolerance?: number;
894
881
  readonly weight?: number;
895
882
  readonly required?: boolean | number;
883
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
884
+ readonly min_score?: number;
896
885
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
897
886
  readonly negate?: boolean;
898
887
  };
@@ -906,6 +895,8 @@ type ContainsEvaluatorConfig = {
906
895
  readonly value: string;
907
896
  readonly weight?: number;
908
897
  readonly required?: boolean | number;
898
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
899
+ readonly min_score?: number;
909
900
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
910
901
  readonly negate?: boolean;
911
902
  };
@@ -919,6 +910,8 @@ type ContainsAnyEvaluatorConfig = {
919
910
  readonly value: readonly string[];
920
911
  readonly weight?: number;
921
912
  readonly required?: boolean | number;
913
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
914
+ readonly min_score?: number;
922
915
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
923
916
  readonly negate?: boolean;
924
917
  };
@@ -932,6 +925,8 @@ type ContainsAllEvaluatorConfig = {
932
925
  readonly value: readonly string[];
933
926
  readonly weight?: number;
934
927
  readonly required?: boolean | number;
928
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
929
+ readonly min_score?: number;
935
930
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
936
931
  readonly negate?: boolean;
937
932
  };
@@ -945,6 +940,8 @@ type IcontainsEvaluatorConfig = {
945
940
  readonly value: string;
946
941
  readonly weight?: number;
947
942
  readonly required?: boolean | number;
943
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
944
+ readonly min_score?: number;
948
945
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
949
946
  readonly negate?: boolean;
950
947
  };
@@ -958,6 +955,8 @@ type IcontainsAnyEvaluatorConfig = {
958
955
  readonly value: readonly string[];
959
956
  readonly weight?: number;
960
957
  readonly required?: boolean | number;
958
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
959
+ readonly min_score?: number;
961
960
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
962
961
  readonly negate?: boolean;
963
962
  };
@@ -971,6 +970,8 @@ type IcontainsAllEvaluatorConfig = {
971
970
  readonly value: readonly string[];
972
971
  readonly weight?: number;
973
972
  readonly required?: boolean | number;
973
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
974
+ readonly min_score?: number;
974
975
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
975
976
  readonly negate?: boolean;
976
977
  };
@@ -984,6 +985,8 @@ type StartsWithEvaluatorConfig = {
984
985
  readonly value: string;
985
986
  readonly weight?: number;
986
987
  readonly required?: boolean | number;
988
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
989
+ readonly min_score?: number;
987
990
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
988
991
  readonly negate?: boolean;
989
992
  };
@@ -997,6 +1000,8 @@ type EndsWithEvaluatorConfig = {
997
1000
  readonly value: string;
998
1001
  readonly weight?: number;
999
1002
  readonly required?: boolean | number;
1003
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1004
+ readonly min_score?: number;
1000
1005
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1001
1006
  readonly negate?: boolean;
1002
1007
  };
@@ -1012,6 +1017,8 @@ type RegexEvaluatorConfig = {
1012
1017
  readonly flags?: string;
1013
1018
  readonly weight?: number;
1014
1019
  readonly required?: boolean | number;
1020
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1021
+ readonly min_score?: number;
1015
1022
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1016
1023
  readonly negate?: boolean;
1017
1024
  };
@@ -1024,6 +1031,8 @@ type IsJsonEvaluatorConfig = {
1024
1031
  readonly type: 'is-json';
1025
1032
  readonly weight?: number;
1026
1033
  readonly required?: boolean | number;
1034
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1035
+ readonly min_score?: number;
1027
1036
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1028
1037
  readonly negate?: boolean;
1029
1038
  };
@@ -1037,6 +1046,8 @@ type EqualsEvaluatorConfig = {
1037
1046
  readonly value: string;
1038
1047
  readonly weight?: number;
1039
1048
  readonly required?: boolean | number;
1049
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1050
+ readonly min_score?: number;
1040
1051
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1041
1052
  readonly negate?: boolean;
1042
1053
  };
@@ -1050,6 +1061,8 @@ type RubricsEvaluatorConfig = {
1050
1061
  readonly criteria: readonly RubricItem[];
1051
1062
  readonly weight?: number;
1052
1063
  readonly required?: boolean | number;
1064
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1065
+ readonly min_score?: number;
1053
1066
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
1054
1067
  readonly negate?: boolean;
1055
1068
  };
@@ -1068,6 +1081,8 @@ type SkillTriggerEvaluatorConfig = {
1068
1081
  readonly should_trigger?: boolean;
1069
1082
  readonly weight?: number;
1070
1083
  readonly required?: boolean | number;
1084
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1085
+ readonly min_score?: number;
1071
1086
  readonly negate?: boolean;
1072
1087
  };
1073
1088
  /**
@@ -1079,6 +1094,8 @@ type InlineAssertEvaluatorConfig = {
1079
1094
  readonly type: 'inline-assert';
1080
1095
  readonly weight?: number;
1081
1096
  readonly required?: boolean | number;
1097
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
1098
+ readonly min_score?: number;
1082
1099
  readonly negate?: boolean;
1083
1100
  };
1084
1101
  type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
@@ -1087,7 +1104,7 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
1087
1104
  */
1088
1105
  interface EvalTest {
1089
1106
  readonly id: string;
1090
- readonly dataset?: string;
1107
+ readonly suite?: string;
1091
1108
  readonly category?: string;
1092
1109
  readonly conversation_id?: string;
1093
1110
  readonly question: string;
@@ -1104,6 +1121,8 @@ interface EvalTest {
1104
1121
  readonly metadata?: Record<string, unknown>;
1105
1122
  /** Per-test target override (matrix evaluation) */
1106
1123
  readonly targets?: readonly string[];
1124
+ /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
1125
+ readonly threshold?: number;
1107
1126
  }
1108
1127
  /** @deprecated Use `EvalTest` instead */
1109
1128
  type EvalCase = EvalTest;
@@ -1197,7 +1216,7 @@ type FailOnError = boolean;
1197
1216
  interface EvaluationResult {
1198
1217
  readonly timestamp: string;
1199
1218
  readonly testId: string;
1200
- readonly dataset?: string;
1219
+ readonly suite?: string;
1201
1220
  readonly category?: string;
1202
1221
  readonly conversationId?: string;
1203
1222
  readonly score: number;
@@ -1427,8 +1446,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skill
1427
1446
 
1428
1447
  type LoadOptions = {
1429
1448
  readonly verbose?: boolean;
1430
- /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
1431
- readonly filter?: string;
1449
+ /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
1450
+ readonly filter?: string | readonly string[];
1432
1451
  /** Category derived from the eval file's directory path */
1433
1452
  readonly category?: string;
1434
1453
  };
@@ -1599,7 +1618,7 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
1599
1618
  /**
1600
1619
  * Strict normalized schema for CLI target configuration.
1601
1620
  * This is the final validated shape after environment variable resolution
1602
- * and snake_case to camelCase normalization.
1621
+ * and internal field normalization.
1603
1622
  *
1604
1623
  * Uses .strict() to reject unknown properties, ensuring configuration
1605
1624
  * errors are caught early rather than silently ignored.
@@ -1648,8 +1667,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
1648
1667
  command: string;
1649
1668
  verbose?: boolean | undefined;
1650
1669
  cwd?: string | undefined;
1651
- filesFormat?: string | undefined;
1652
- workspaceTemplate?: string | undefined;
1653
1670
  healthcheck?: {
1654
1671
  url: string;
1655
1672
  timeoutMs?: number | undefined;
@@ -1658,14 +1675,14 @@ declare const CliTargetConfigSchema: z.ZodObject<{
1658
1675
  cwd?: string | undefined;
1659
1676
  timeoutMs?: number | undefined;
1660
1677
  } | undefined;
1661
- keepTempFiles?: boolean | undefined;
1662
1678
  timeoutMs?: number | undefined;
1679
+ filesFormat?: string | undefined;
1680
+ workspaceTemplate?: string | undefined;
1681
+ keepTempFiles?: boolean | undefined;
1663
1682
  }, {
1664
1683
  command: string;
1665
1684
  verbose?: boolean | undefined;
1666
1685
  cwd?: string | undefined;
1667
- filesFormat?: string | undefined;
1668
- workspaceTemplate?: string | undefined;
1669
1686
  healthcheck?: {
1670
1687
  url: string;
1671
1688
  timeoutMs?: number | undefined;
@@ -1674,8 +1691,10 @@ declare const CliTargetConfigSchema: z.ZodObject<{
1674
1691
  cwd?: string | undefined;
1675
1692
  timeoutMs?: number | undefined;
1676
1693
  } | undefined;
1677
- keepTempFiles?: boolean | undefined;
1678
1694
  timeoutMs?: number | undefined;
1695
+ filesFormat?: string | undefined;
1696
+ workspaceTemplate?: string | undefined;
1697
+ keepTempFiles?: boolean | undefined;
1679
1698
  }>;
1680
1699
  type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
1681
1700
  /**
@@ -1707,6 +1726,7 @@ interface AzureResolvedConfig {
1707
1726
  readonly deploymentName: string;
1708
1727
  readonly apiKey: string;
1709
1728
  readonly version?: string;
1729
+ readonly apiFormat?: ApiFormat;
1710
1730
  readonly temperature?: number;
1711
1731
  readonly maxOutputTokens?: number;
1712
1732
  readonly retry?: RetryConfig;
@@ -1787,6 +1807,18 @@ interface CopilotSdkResolvedConfig {
1787
1807
  readonly logDir?: string;
1788
1808
  readonly logFormat?: 'summary' | 'json';
1789
1809
  readonly systemPrompt?: string;
1810
+ /** BYOK provider type: "azure", "openai", or "anthropic". */
1811
+ readonly byokType?: string;
1812
+ /** BYOK base URL for the provider endpoint. */
1813
+ readonly byokBaseUrl?: string;
1814
+ /** BYOK API key for authenticating with the provider. */
1815
+ readonly byokApiKey?: string;
1816
+ /** BYOK bearer token (takes precedence over apiKey when set). */
1817
+ readonly byokBearerToken?: string;
1818
+ /** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
1819
+ readonly byokApiVersion?: string;
1820
+ /** BYOK wire API format: "completions" or "responses". */
1821
+ readonly byokWireApi?: string;
1790
1822
  }
1791
1823
  interface CopilotLogResolvedConfig {
1792
1824
  /** Explicit path to a session directory containing events.jsonl. */
@@ -1931,15 +1963,20 @@ type ResolvedTarget = (ResolvedTargetBase & {
1931
1963
  }) | (ResolvedTargetBase & {
1932
1964
  readonly kind: 'cli';
1933
1965
  readonly config: CliResolvedConfig;
1966
+ }) | (ResolvedTargetBase & {
1967
+ readonly kind: 'transcript';
1968
+ readonly config: Record<string, never>;
1934
1969
  });
1935
1970
  /**
1936
1971
  * Optional settings accepted on ALL target definitions regardless of provider.
1937
1972
  * Exported so the targets validator can reuse the same list — adding a field
1938
1973
  * here automatically makes it valid in targets.yaml without a separate update.
1939
1974
  */
1940
- declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed", "fallback_targets", "fallbackTargets"];
1975
+ declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
1941
1976
  declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
1942
- declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
1977
+ declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
1978
+ readonly emitDeprecationWarnings?: boolean;
1979
+ }): ResolvedTarget;
1943
1980
 
1944
1981
  /**
1945
1982
  * Extensible provider registry.
@@ -2204,19 +2241,25 @@ interface EvaluatorFactory {
2204
2241
  *
2205
2242
  * Scoring model:
2206
2243
  * score ∈ [0, 1] — continuous quality signal
2207
- * verdict — binary classification derived from score via PASS_THRESHOLD
2244
+ * verdict — binary classification derived from score via threshold
2208
2245
  *
2209
- * score >= PASS_THRESHOLD → 'pass'
2210
- * score < PASS_THRESHOLD → 'fail'
2246
+ * score >= threshold → 'pass'
2247
+ * score < threshold → 'fail'
2211
2248
  * (infrastructure skip) → 'skip'
2212
2249
  *
2213
- * To change the pass/fail boundary, update PASS_THRESHOLD.
2214
- * All verdict derivation flows through scoreToVerdict().
2250
+ * Scoring scale principle:
2251
+ * All user-configurable score thresholds use 0-1 scale.
2252
+ * The only 0-10 values in YAML are `score_ranges` which define LLM integer output band labels.
2253
+ *
2254
+ * Default threshold is 0.8. Override via CLI `--threshold`, suite `execution.threshold`,
2255
+ * or per-test `execution.threshold`. All verdict derivation flows through scoreToVerdict().
2215
2256
  */
2216
2257
 
2217
- /** Score threshold for pass verdict. Scores below this are fail. */
2258
+ /** Default score threshold for pass verdict (0-1). Scores below this are fail. */
2259
+ declare const DEFAULT_THRESHOLD = 0.8;
2260
+ /** @deprecated Use DEFAULT_THRESHOLD instead. */
2218
2261
  declare const PASS_THRESHOLD = 0.8;
2219
- declare function scoreToVerdict(score: number): EvaluationVerdict;
2262
+ declare function scoreToVerdict(score: number, threshold?: number): EvaluationVerdict;
2220
2263
  declare function clampScore(value: number): number;
2221
2264
  declare function extractJsonBlob(text: string): string | undefined;
2222
2265
  declare function parseJsonFromText(text: string): unknown;
@@ -2499,6 +2542,7 @@ declare class LlmGraderEvaluator implements Evaluator {
2499
2542
  private buildScoreRangePrompt;
2500
2543
  private buildRubricPrompt;
2501
2544
  private runWithRetry;
2545
+ private generateStructuredResponse;
2502
2546
  }
2503
2547
  /**
2504
2548
  * Build the mandatory output schema that all evaluators must follow.
@@ -2837,8 +2881,8 @@ interface RunEvaluationOptions {
2837
2881
  readonly cache?: EvaluationCache;
2838
2882
  readonly useCache?: boolean;
2839
2883
  readonly now?: () => Date;
2840
- /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
2841
- readonly filter?: string;
2884
+ /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
2885
+ readonly filter?: string | readonly string[];
2842
2886
  readonly verbose?: boolean;
2843
2887
  readonly maxConcurrency?: number;
2844
2888
  readonly evalCases?: readonly EvalTest[];
@@ -3008,6 +3052,8 @@ interface EvalAssertionInput {
3008
3052
  readonly weight?: number;
3009
3053
  /** Whether this assertion is required to pass */
3010
3054
  readonly required?: boolean | number;
3055
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
3056
+ readonly min_score?: number;
3011
3057
  /** Prompt file for llm_grader */
3012
3058
  readonly prompt?: string;
3013
3059
  /** Script for code_grader */
@@ -3042,8 +3088,8 @@ interface EvalConfig {
3042
3088
  readonly task?: (input: string) => string | Promise<string>;
3043
3089
  /** Suite-level assertions applied to all tests */
3044
3090
  readonly assert?: readonly AssertEntry[];
3045
- /** Filter tests by ID pattern (glob supported) */
3046
- readonly filter?: string;
3091
+ /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
3092
+ readonly filter?: string | readonly string[];
3047
3093
  /** Maximum concurrent workers (default: 3) */
3048
3094
  readonly workers?: number;
3049
3095
  /** Maximum retries on failure (default: 2) */
@@ -3056,6 +3102,8 @@ interface EvalConfig {
3056
3102
  readonly verbose?: boolean;
3057
3103
  /** Callback for each completed result */
3058
3104
  readonly onResult?: (result: EvaluationResult) => void;
3105
+ /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
3106
+ readonly threshold?: number;
3059
3107
  }
3060
3108
  /**
3061
3109
  * Summary statistics for an evaluation run.
@@ -3063,9 +3111,9 @@ interface EvalConfig {
3063
3111
  interface EvalSummary {
3064
3112
  /** Total number of test cases */
3065
3113
  readonly total: number;
3066
- /** Number of passing test cases (score >= PASS_THRESHOLD) */
3114
+ /** Number of passing test cases (score >= threshold) */
3067
3115
  readonly passed: number;
3068
- /** Number of failing test cases (score < PASS_THRESHOLD) */
3116
+ /** Number of failing test cases (score < threshold) */
3069
3117
  readonly failed: number;
3070
3118
  /** Total duration in milliseconds */
3071
3119
  readonly durationMs: number;
@@ -3505,7 +3553,7 @@ declare class WorkspacePoolManager {
3505
3553
  private removeAllSlots;
3506
3554
  /**
3507
3555
  * Reset an existing slot for reuse:
3508
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
3556
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
3509
3557
  * 2. Re-copy template files (skip repo directories)
3510
3558
  */
3511
3559
  private resetSlot;
@@ -3811,15 +3859,21 @@ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string):
3811
3859
  /**
3812
3860
  * Core types for the transcript import pipeline.
3813
3861
  *
3814
- * A TranscriptEntry represents a single event in a parsed agent session
3815
- * transcript (user message, assistant response, tool call, etc.).
3862
+ * A TranscriptEntry is the internal (camelCase) representation of a parsed
3863
+ * session. A TranscriptJsonLine is the on-disk (snake_case) wire format
3864
+ * written to .agentv/transcripts/*.jsonl files.
3865
+ *
3866
+ * Flow:
3867
+ * raw session JSONL → parser → TranscriptEntry (internal)
3868
+ * TranscriptEntry → toTranscriptJsonLine() → JSONL on disk
3869
+ * JSONL on disk → readTranscriptJsonl() → TranscriptJsonLine[]
3816
3870
  *
3817
- * A TranscriptSource describes where a transcript came from (provider,
3818
- * session ID, file path, etc.).
3871
+ * To add a new importer: write a parser that returns TranscriptEntry,
3872
+ * then use toTranscriptJsonLine() to serialize.
3819
3873
  */
3820
3874
 
3821
3875
  /**
3822
- * A parsed transcript: ordered messages plus session metadata.
3876
+ * A parsed transcript: ordered messages plus session metadata (internal camelCase).
3823
3877
  */
3824
3878
  interface TranscriptEntry {
3825
3879
  readonly messages: Message[];
@@ -3829,7 +3883,7 @@ interface TranscriptEntry {
3829
3883
  readonly costUsd?: number | null;
3830
3884
  }
3831
3885
  /**
3832
- * Metadata describing the origin of a transcript.
3886
+ * Metadata describing the origin of a transcript (internal camelCase).
3833
3887
  */
3834
3888
  interface TranscriptSource {
3835
3889
  readonly provider: string;
@@ -3837,7 +3891,45 @@ interface TranscriptSource {
3837
3891
  readonly projectPath?: string;
3838
3892
  readonly startedAt?: string;
3839
3893
  readonly model?: string;
3894
+ readonly version?: string;
3895
+ readonly gitBranch?: string;
3896
+ readonly cwd?: string;
3897
+ }
3898
+ /**
3899
+ * One line in a transcript JSONL file (snake_case wire format).
3900
+ *
3901
+ * Each line is a self-contained test case with pre-populated output.
3902
+ * The `input` field is the first user message; the `output` field is the
3903
+ * full conversation (Message[]).
3904
+ */
3905
+ interface TranscriptJsonLine {
3906
+ readonly input: string;
3907
+ readonly output: readonly Message[];
3908
+ readonly token_usage?: {
3909
+ readonly input: number;
3910
+ readonly output: number;
3911
+ readonly cached?: number;
3912
+ };
3913
+ readonly duration_ms?: number;
3914
+ readonly cost_usd?: number | null;
3915
+ readonly source: {
3916
+ readonly provider: string;
3917
+ readonly session_id: string;
3918
+ readonly model?: string;
3919
+ readonly timestamp?: string;
3920
+ readonly git_branch?: string;
3921
+ readonly cwd?: string;
3922
+ readonly version?: string;
3923
+ };
3840
3924
  }
3925
+ /**
3926
+ * Convert a parsed TranscriptEntry to the on-disk JSONL wire format.
3927
+ */
3928
+ declare function toTranscriptJsonLine(entry: TranscriptEntry): TranscriptJsonLine;
3929
+ /**
3930
+ * Read a transcript JSONL file and parse each line into a TranscriptJsonLine.
3931
+ */
3932
+ declare function readTranscriptJsonl(filePath: string): Promise<TranscriptJsonLine[]>;
3841
3933
  /**
3842
3934
  * Read a JSONL transcript file and return its raw text.
3843
3935
  * Throws if the file does not exist or cannot be read.
@@ -3871,6 +3963,70 @@ declare function readTranscriptFile(filePath: string): Promise<string>;
3871
3963
 
3872
3964
  declare function parseClaudeSession(jsonl: string): TranscriptEntry;
3873
3965
 
3966
+ /**
3967
+ * Codex CLI session JSONL parser.
3968
+ *
3969
+ * Reads a Codex CLI rollout transcript
3970
+ * (~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl) and converts it to AgentV's
3971
+ * Message[] format.
3972
+ *
3973
+ * Each line is a JSON object with one of these top-level types:
3974
+ * session_meta → session metadata (id, cwd, cli_version, model)
3975
+ * turn_context → per-turn context (model, cwd, turn_id)
3976
+ * event_msg → events: task_started, task_complete, user_message,
3977
+ * agent_message, token_count
3978
+ * response_item → conversation items: message, function_call,
3979
+ * function_call_output, reasoning, custom_tool_call,
3980
+ * custom_tool_call_output
3981
+ *
3982
+ * Key behaviors:
3983
+ * - response_item with type=message and role=user → user Message
3984
+ * - response_item with type=message and role=assistant → assistant Message
3985
+ * - response_item with type=function_call → ToolCall (pending output)
3986
+ * - response_item with type=function_call_output → matched to pending call by call_id
3987
+ * - response_item with type=reasoning → skipped (thinking tokens)
3988
+ * - response_item with role=developer → skipped (system prompt)
3989
+ * - session_meta → source metadata (session_id, cwd, version, model)
3990
+ * - turn_context → model name extraction
3991
+ * - Duration is from first↔last event timestamp
3992
+ * - cost_usd is null (Codex CLI does not report per-session cost)
3993
+ * - Token usage not available from rollout format (rate limit info only)
3994
+ *
3995
+ * To add a new response_item type: add a case to the switch in parseCodexSession().
3996
+ */
3997
+
3998
+ declare function parseCodexSession(jsonl: string): TranscriptEntry;
3999
+
4000
+ /**
4001
+ * Codex CLI session discovery.
4002
+ *
4003
+ * Scans ~/.codex/sessions/ for rollout JSONL files. Codex CLI stores sessions at:
4004
+ * ~/.codex/sessions/YYYY/MM/DD/rollout-<timestamp>-<uuid>.jsonl
4005
+ *
4006
+ * Sessions are returned sorted by modification time (most recent first).
4007
+ */
4008
+ interface CodexSession {
4009
+ /** UUID from the filename */
4010
+ readonly sessionId: string;
4011
+ /** Full path to the JSONL file */
4012
+ readonly filePath: string;
4013
+ /** Filename (e.g., rollout-2026-03-29T14-22-01-<uuid>.jsonl) */
4014
+ readonly filename: string;
4015
+ /** Last modification time */
4016
+ readonly updatedAt: Date;
4017
+ }
4018
+ interface CodexDiscoverOptions {
4019
+ /** Filter by date string (YYYY-MM-DD). */
4020
+ readonly date?: string;
4021
+ /** Maximum number of sessions to return (default: 10). */
4022
+ readonly limit?: number;
4023
+ /** Override the default ~/.codex/sessions directory. */
4024
+ readonly sessionsDir?: string;
4025
+ /** Return only the most recent session. */
4026
+ readonly latest?: boolean;
4027
+ }
4028
+ declare function discoverCodexSessions(opts?: CodexDiscoverOptions): Promise<CodexSession[]>;
4029
+
3874
4030
  /**
3875
4031
  * Claude Code session discovery.
3876
4032
  *
@@ -3907,9 +4063,80 @@ interface ClaudeDiscoverOptions {
3907
4063
  }
3908
4064
  declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<ClaudeSession[]>;
3909
4065
 
4066
+ /**
4067
+ * Transcript provider — replays pre-recorded session transcripts through the
4068
+ * evaluation pipeline without invoking any live agent.
4069
+ *
4070
+ * Used by `agentv eval --transcript <file>` to grade imported sessions.
4071
+ *
4072
+ * How it works:
4073
+ * 1. Reads a transcript JSONL file (produced by `agentv import`)
4074
+ * 2. Each invocation pops the next line from the transcript
4075
+ * 3. Returns a ProviderResponse with pre-populated output, token usage, etc.
4076
+ * 4. Evaluators run identically to live eval — they see the same ProviderResponse
4077
+ *
4078
+ * The provider name in results is set to the source provider from the transcript
4079
+ * (e.g., "claude", "codex", "copilot").
4080
+ */
4081
+
4082
+ declare class TranscriptProvider implements Provider {
4083
+ readonly id: string;
4084
+ readonly kind: "transcript";
4085
+ readonly targetName: string;
4086
+ private lines;
4087
+ private cursor;
4088
+ constructor(targetName: string, lines: TranscriptJsonLine[]);
4089
+ /**
4090
+ * Create a TranscriptProvider from a JSONL file path.
4091
+ */
4092
+ static fromFile(filePath: string): Promise<TranscriptProvider>;
4093
+ get lineCount(): number;
4094
+ invoke(_request: ProviderRequest): Promise<ProviderResponse>;
4095
+ }
4096
+
4097
+ /**
4098
+ * Copilot CLI events.jsonl parser.
4099
+ *
4100
+ * Reads a Copilot CLI session transcript (events.jsonl) and converts it to
4101
+ * AgentV's Message[] format. Each line is a JSON object with:
4102
+ * { type, data: { ...payload }, id, timestamp, parentId }
4103
+ *
4104
+ * All event-specific fields live under event.data.*, while type, id, timestamp,
4105
+ * and parentId are at the top level.
4106
+ *
4107
+ * Supported event types:
4108
+ * session.start → session metadata (data.sessionId, data.context.cwd)
4109
+ * user.message → Message { role: 'user' }
4110
+ * assistant.message → Message { role: 'assistant', toolCalls from data.toolRequests }
4111
+ * skill.invoked → ToolCall { tool: 'Skill', input: { skill: data.name } }
4112
+ * tool.execution_start + tool.execution_complete → ToolCall with output
4113
+ * session.shutdown → token usage from data.modelMetrics, end timestamp
4114
+ *
4115
+ * To add a new event type:
4116
+ * 1. Add a case to the switch in parseCopilotEvents()
4117
+ * 2. Map it to a Message or ToolCall
4118
+ * 3. Add a test in copilot-log-parser.test.ts
4119
+ */
4120
+
4121
+ interface CopilotSessionMeta {
4122
+ readonly sessionId: string;
4123
+ readonly model: string;
4124
+ readonly cwd: string;
4125
+ readonly repository?: string;
4126
+ readonly branch?: string;
4127
+ readonly startedAt?: string;
4128
+ }
4129
+ interface ParsedCopilotSession {
4130
+ readonly messages: Message[];
4131
+ readonly meta: CopilotSessionMeta;
4132
+ readonly tokenUsage?: ProviderTokenUsage;
4133
+ readonly durationMs?: number;
4134
+ }
4135
+ declare function parseCopilotEvents(eventsJsonl: string): ParsedCopilotSession;
4136
+
3910
4137
  type AgentKernel = {
3911
4138
  status: string;
3912
4139
  };
3913
4140
  declare function createAgentKernel(): AgentKernel;
3914
4141
 
3915
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4142
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };