npm - @agentv/core - Versions diffs - 4.6.1 → 4.7.0 - Mend

@agentv/core 4.6.1 → 4.7.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-ZK4GG7PR.js → chunk-75RFVESM.js} +215 -127
package/dist/chunk-75RFVESM.js.map +1 -0
package/dist/evaluation/validation/index.cjs +110 -95
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +30 -72
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +1271 -465
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +288 -74
package/dist/index.d.ts +288 -74
package/dist/index.js +1024 -311
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-ZK4GG7PR.js.map +0 -1

package/dist/index.d.cts CHANGED Viewed

@@ -73,7 +73,7 @@ interface ChatMessage {
     readonly name?: string;
 }
 type ChatPrompt = readonly ChatMessage[];
-type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
+type ProviderKind = 'openai' | 'openrouter' | 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'copilot-log' | 'pi-coding-agent' | 'pi-cli' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv' | 'transcript';
 /** Callbacks for real-time observability during provider execution */
 interface ProviderStreamCallbacks {
     onToolCallStart?: (toolName: string, toolCallId?: string) => void;
@@ -222,25 +222,19 @@ interface TargetDefinition {
     readonly judge_target?: string | undefined;
     readonly workers?: number | undefined;
     readonly provider_batching?: boolean | undefined;
-    readonly providerBatching?: boolean | undefined;
+    readonly subagent_mode_allowed?: boolean | undefined;
     readonly endpoint?: string | unknown | undefined;
     readonly base_url?: string | unknown | undefined;
-    readonly baseUrl?: string | unknown | undefined;
     readonly resource?: string | unknown | undefined;
-    readonly resourceName?: string | unknown | undefined;
     readonly api_key?: string | unknown | undefined;
-    readonly apiKey?: string | unknown | undefined;
     readonly deployment?: string | unknown | undefined;
-    readonly deploymentName?: string | unknown | undefined;
     readonly model?: string | unknown | undefined;
     readonly version?: string | unknown | undefined;
     readonly api_version?: string | unknown | undefined;
     readonly variant?: string | unknown | undefined;
     readonly thinking_budget?: number | unknown | undefined;
-    readonly thinkingBudget?: number | unknown | undefined;
     readonly temperature?: number | unknown | undefined;
     readonly max_output_tokens?: number | unknown | undefined;
-    readonly maxTokens?: number | unknown | undefined;
     readonly executable?: string | unknown | undefined;
     readonly command?: string | unknown | undefined;
     readonly binary?: string | unknown | undefined;
@@ -248,63 +242,35 @@ interface TargetDefinition {
     readonly arguments?: unknown | undefined;
     readonly cwd?: string | unknown | undefined;
     readonly timeout_seconds?: number | unknown | undefined;
-    readonly timeoutSeconds?: number | unknown | undefined;
     readonly log_dir?: string | unknown | undefined;
-    readonly logDir?: string | unknown | undefined;
     readonly log_directory?: string | unknown | undefined;
-    readonly logDirectory?: string | unknown | undefined;
     readonly log_format?: string | unknown | undefined;
-    readonly logFormat?: string | unknown | undefined;
     readonly log_output_format?: string | unknown | undefined;
-    readonly logOutputFormat?: string | unknown | undefined;
     readonly system_prompt?: string | unknown | undefined;
-    readonly systemPrompt?: string | unknown | undefined;
     readonly max_turns?: number | unknown | undefined;
-    readonly maxTurns?: number | unknown | undefined;
     readonly max_budget_usd?: number | unknown | undefined;
-    readonly maxBudgetUsd?: number | unknown | undefined;
     readonly response?: string | unknown | undefined;
-    readonly delayMs?: number | unknown | undefined;
-    readonly delayMinMs?: number | unknown | undefined;
-    readonly delayMaxMs?: number | unknown | undefined;
     readonly wait?: boolean | unknown | undefined;
     readonly dry_run?: boolean | unknown | undefined;
-    readonly dryRun?: boolean | unknown | undefined;
     readonly subagent_root?: string | unknown | undefined;
-    readonly subagentRoot?: string | unknown | undefined;
     readonly workspace_template?: string | unknown | undefined;
-    readonly workspaceTemplate?: string | unknown | undefined;
     readonly files_format?: string | unknown | undefined;
-    readonly filesFormat?: string | unknown | undefined;
     readonly attachments_format?: string | unknown | undefined;
-    readonly attachmentsFormat?: string | unknown | undefined;
     readonly env?: unknown | undefined;
     readonly healthcheck?: unknown | undefined;
     readonly session_dir?: string | unknown | undefined;
-    readonly sessionDir?: string | unknown | undefined;
     readonly session_id?: string | unknown | undefined;
-    readonly sessionId?: string | unknown | undefined;
     readonly discover?: string | unknown | undefined;
     readonly session_state_dir?: string | unknown | undefined;
-    readonly sessionStateDir?: string | unknown | undefined;
     readonly cli_url?: string | unknown | undefined;
-    readonly cliUrl?: string | unknown | undefined;
     readonly cli_path?: string | unknown | undefined;
-    readonly cliPath?: string | unknown | undefined;
     readonly github_token?: string | unknown | undefined;
-    readonly githubToken?: string | unknown | undefined;
     readonly max_retries?: number | unknown | undefined;
-    readonly maxRetries?: number | unknown | undefined;
     readonly retry_initial_delay_ms?: number | unknown | undefined;
-    readonly retryInitialDelayMs?: number | unknown | undefined;
     readonly retry_max_delay_ms?: number | unknown | undefined;
-    readonly retryMaxDelayMs?: number | unknown | undefined;
     readonly retry_backoff_factor?: number | unknown | undefined;
-    readonly retryBackoffFactor?: number | unknown | undefined;
     readonly retry_status_codes?: unknown | undefined;
-    readonly retryStatusCodes?: unknown | undefined;
     readonly fallback_targets?: readonly string[] | unknown | undefined;
-    readonly fallbackTargets?: readonly string[] | unknown | undefined;
 }
 /**
@@ -375,6 +341,8 @@ interface ToolTrajectoryEvaluatorConfig {
     /** Optional weight for top-level aggregation (defaults to 1.0) */
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
     /** Default argument matching mode for all expected items (defaults to 'exact') */
@@ -667,6 +635,8 @@ type CodeEvaluatorConfig = {
     readonly resolvedCwd?: string;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
     /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
@@ -699,6 +669,8 @@ type LlmGraderEvaluatorConfig = {
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
     /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
@@ -737,13 +709,17 @@ type RubricItem = {
     readonly outcome?: string;
     readonly weight: number;
     /**
-     * Legacy boolean gating (deprecated, treated as required_min_score: 10).
-     * Use required_min_score instead for finer control.
+     * Legacy boolean gating (treated as min_score: 1.0 for score-range rubrics).
      */
     readonly required?: boolean;
     /**
-     * Minimum score (0-10) required to pass this criterion.
-     * If the criterion score is below this threshold, the overall verdict is 'fail'.
+     * Minimum score (0-1 scale) required to pass this criterion.
+     * Internally compared against normalized score (rawScore / 10).
+     */
+    readonly min_score?: number;
+    /**
+     * @deprecated Use min_score (0-1 scale) instead.
+     * Legacy: minimum score on 0-10 integer scale.
      */
     readonly required_min_score?: number;
     /**
@@ -776,6 +752,8 @@ type CompositeEvaluatorConfig = {
     readonly aggregator: CompositeAggregatorConfig;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -820,6 +798,8 @@ type FieldAccuracyEvaluatorConfig = {
     readonly aggregation?: FieldAggregationType;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -834,6 +814,8 @@ type LatencyEvaluatorConfig = {
     readonly threshold: number;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -848,6 +830,8 @@ type CostEvaluatorConfig = {
     readonly budget: number;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -866,6 +850,8 @@ type TokenUsageEvaluatorConfig = {
     readonly max_output?: number;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -893,6 +879,8 @@ type ExecutionMetricsEvaluatorConfig = {
     readonly exploration_tolerance?: number;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -906,6 +894,8 @@ type ContainsEvaluatorConfig = {
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -919,6 +909,8 @@ type ContainsAnyEvaluatorConfig = {
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -932,6 +924,8 @@ type ContainsAllEvaluatorConfig = {
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -945,6 +939,8 @@ type IcontainsEvaluatorConfig = {
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -958,6 +954,8 @@ type IcontainsAnyEvaluatorConfig = {
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -971,6 +969,8 @@ type IcontainsAllEvaluatorConfig = {
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -984,6 +984,8 @@ type StartsWithEvaluatorConfig = {
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -997,6 +999,8 @@ type EndsWithEvaluatorConfig = {
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -1012,6 +1016,8 @@ type RegexEvaluatorConfig = {
     readonly flags?: string;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -1024,6 +1030,8 @@ type IsJsonEvaluatorConfig = {
     readonly type: 'is-json';
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -1037,6 +1045,8 @@ type EqualsEvaluatorConfig = {
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -1050,6 +1060,8 @@ type RubricsEvaluatorConfig = {
     readonly criteria: readonly RubricItem[];
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
@@ -1068,6 +1080,8 @@ type SkillTriggerEvaluatorConfig = {
     readonly should_trigger?: boolean;
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     readonly negate?: boolean;
 };
 /**
@@ -1079,6 +1093,8 @@ type InlineAssertEvaluatorConfig = {
     readonly type: 'inline-assert';
     readonly weight?: number;
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     readonly negate?: boolean;
 };
 type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
@@ -1087,7 +1103,7 @@ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | Composit
  */
 interface EvalTest {
     readonly id: string;
-    readonly dataset?: string;
+    readonly suite?: string;
     readonly category?: string;
     readonly conversation_id?: string;
     readonly question: string;
@@ -1104,6 +1120,8 @@ interface EvalTest {
     readonly metadata?: Record<string, unknown>;
     /** Per-test target override (matrix evaluation) */
     readonly targets?: readonly string[];
+    /** Per-test score threshold override (0-1). Resolution: CLI > test > suite > DEFAULT_THRESHOLD. */
+    readonly threshold?: number;
 }
 /** @deprecated Use `EvalTest` instead */
 type EvalCase = EvalTest;
@@ -1197,7 +1215,7 @@ type FailOnError = boolean;
 interface EvaluationResult {
     readonly timestamp: string;
     readonly testId: string;
-    readonly dataset?: string;
+    readonly suite?: string;
     readonly category?: string;
     readonly conversationId?: string;
     readonly score: number;
@@ -1427,8 +1445,8 @@ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skill
 type LoadOptions = {
     readonly verbose?: boolean;
-    /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
-    readonly filter?: string;
+    /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
+    readonly filter?: string | readonly string[];
     /** Category derived from the eval file's directory path */
     readonly category?: string;
 };
@@ -1599,7 +1617,7 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
 /**
  * Strict normalized schema for CLI target configuration.
  * This is the final validated shape after environment variable resolution
- * and snake_case to camelCase normalization.
+ * and internal field normalization.
  *
  * Uses .strict() to reject unknown properties, ensuring configuration
  * errors are caught early rather than silently ignored.
@@ -1648,8 +1666,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
     command: string;
     verbose?: boolean | undefined;
     cwd?: string | undefined;
-    filesFormat?: string | undefined;
-    workspaceTemplate?: string | undefined;
     healthcheck?: {
         url: string;
         timeoutMs?: number | undefined;
@@ -1658,14 +1674,14 @@ declare const CliTargetConfigSchema: z.ZodObject<{
         cwd?: string | undefined;
         timeoutMs?: number | undefined;
     } | undefined;
-    keepTempFiles?: boolean | undefined;
     timeoutMs?: number | undefined;
+    filesFormat?: string | undefined;
+    workspaceTemplate?: string | undefined;
+    keepTempFiles?: boolean | undefined;
 }, {
     command: string;
     verbose?: boolean | undefined;
     cwd?: string | undefined;
-    filesFormat?: string | undefined;
-    workspaceTemplate?: string | undefined;
     healthcheck?: {
         url: string;
         timeoutMs?: number | undefined;
@@ -1674,8 +1690,10 @@ declare const CliTargetConfigSchema: z.ZodObject<{
         cwd?: string | undefined;
         timeoutMs?: number | undefined;
     } | undefined;
-    keepTempFiles?: boolean | undefined;
     timeoutMs?: number | undefined;
+    filesFormat?: string | undefined;
+    workspaceTemplate?: string | undefined;
+    keepTempFiles?: boolean | undefined;
 }>;
 type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
 /**
@@ -1707,6 +1725,7 @@ interface AzureResolvedConfig {
     readonly deploymentName: string;
     readonly apiKey: string;
     readonly version?: string;
+    readonly apiFormat?: ApiFormat;
     readonly temperature?: number;
     readonly maxOutputTokens?: number;
     readonly retry?: RetryConfig;
@@ -1931,15 +1950,20 @@ type ResolvedTarget = (ResolvedTargetBase & {
 }) | (ResolvedTargetBase & {
     readonly kind: 'cli';
     readonly config: CliResolvedConfig;
+}) | (ResolvedTargetBase & {
+    readonly kind: 'transcript';
+    readonly config: Record<string, never>;
 });
 /**
  * Optional settings accepted on ALL target definitions regardless of provider.
  * Exported so the targets validator can reuse the same list — adding a field
  * here automatically makes it valid in targets.yaml without a separate update.
  */
-declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "providerBatching", "subagent_mode_allowed", "subagentModeAllowed", "fallback_targets", "fallbackTargets"];
+declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
 declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
-declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string): ResolvedTarget;
+declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
+    readonly emitDeprecationWarnings?: boolean;
+}): ResolvedTarget;
 /**
  * Extensible provider registry.
@@ -2204,19 +2228,25 @@ interface EvaluatorFactory {
  *
  * Scoring model:
  *   score  ∈ [0, 1]  — continuous quality signal
- *   verdict           — binary classification derived from score via PASS_THRESHOLD
+ *   verdict           — binary classification derived from score via threshold
  *
- *   score >= PASS_THRESHOLD  →  'pass'
- *   score <  PASS_THRESHOLD  →  'fail'
+ *   score >= threshold  →  'pass'
+ *   score <  threshold  →  'fail'
  *   (infrastructure skip)    →  'skip'
  *
- * To change the pass/fail boundary, update PASS_THRESHOLD.
- * All verdict derivation flows through scoreToVerdict().
+ * Scoring scale principle:
+ *   All user-configurable score thresholds use 0-1 scale.
+ *   The only 0-10 values in YAML are `score_ranges` which define LLM integer output band labels.
+ *
+ * Default threshold is 0.8. Override via CLI `--threshold`, suite `execution.threshold`,
+ * or per-test `execution.threshold`. All verdict derivation flows through scoreToVerdict().
  */
-/** Score threshold for pass verdict. Scores below this are fail. */
+/** Default score threshold for pass verdict (0-1). Scores below this are fail. */
+declare const DEFAULT_THRESHOLD = 0.8;
+/** @deprecated Use DEFAULT_THRESHOLD instead. */
 declare const PASS_THRESHOLD = 0.8;
-declare function scoreToVerdict(score: number): EvaluationVerdict;
+declare function scoreToVerdict(score: number, threshold?: number): EvaluationVerdict;
 declare function clampScore(value: number): number;
 declare function extractJsonBlob(text: string): string | undefined;
 declare function parseJsonFromText(text: string): unknown;
@@ -2499,6 +2529,7 @@ declare class LlmGraderEvaluator implements Evaluator {
     private buildScoreRangePrompt;
     private buildRubricPrompt;
     private runWithRetry;
+    private generateStructuredResponse;
 }
 /**
  * Build the mandatory output schema that all evaluators must follow.
@@ -2837,8 +2868,8 @@ interface RunEvaluationOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly now?: () => Date;
-    /** Filter tests by ID pattern (glob supported, e.g., "summary-*") */
-    readonly filter?: string;
+    /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
+    readonly filter?: string | readonly string[];
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
     readonly evalCases?: readonly EvalTest[];
@@ -3008,6 +3039,8 @@ interface EvalAssertionInput {
     readonly weight?: number;
     /** Whether this assertion is required to pass */
     readonly required?: boolean | number;
+    /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
+    readonly min_score?: number;
     /** Prompt file for llm_grader */
     readonly prompt?: string;
     /** Script for code_grader */
@@ -3042,8 +3075,8 @@ interface EvalConfig {
     readonly task?: (input: string) => string | Promise<string>;
     /** Suite-level assertions applied to all tests */
     readonly assert?: readonly AssertEntry[];
-    /** Filter tests by ID pattern (glob supported) */
-    readonly filter?: string;
+    /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
+    readonly filter?: string | readonly string[];
     /** Maximum concurrent workers (default: 3) */
     readonly workers?: number;
     /** Maximum retries on failure (default: 2) */
@@ -3056,6 +3089,8 @@ interface EvalConfig {
     readonly verbose?: boolean;
     /** Callback for each completed result */
     readonly onResult?: (result: EvaluationResult) => void;
+    /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
+    readonly threshold?: number;
 }
 /**
  * Summary statistics for an evaluation run.
@@ -3063,9 +3098,9 @@ interface EvalConfig {
 interface EvalSummary {
     /** Total number of test cases */
     readonly total: number;
-    /** Number of passing test cases (score >= PASS_THRESHOLD) */
+    /** Number of passing test cases (score >= threshold) */
     readonly passed: number;
-    /** Number of failing test cases (score < PASS_THRESHOLD) */
+    /** Number of failing test cases (score < threshold) */
     readonly failed: number;
     /** Total duration in milliseconds */
     readonly durationMs: number;
@@ -3505,7 +3540,7 @@ declare class WorkspacePoolManager {
     private removeAllSlots;
     /**
      * Reset an existing slot for reuse:
-     * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
+     * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
      * 2. Re-copy template files (skip repo directories)
      */
     private resetSlot;
@@ -3811,15 +3846,21 @@ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string):
 /**
  * Core types for the transcript import pipeline.
  *
- * A TranscriptEntry represents a single event in a parsed agent session
- * transcript (user message, assistant response, tool call, etc.).
+ * A TranscriptEntry is the internal (camelCase) representation of a parsed
+ * session. A TranscriptJsonLine is the on-disk (snake_case) wire format
+ * written to .agentv/transcripts/*.jsonl files.
+ *
+ * Flow:
+ *   raw session JSONL → parser → TranscriptEntry (internal)
+ *   TranscriptEntry → toTranscriptJsonLine() → JSONL on disk
+ *   JSONL on disk → readTranscriptJsonl() → TranscriptJsonLine[]
  *
- * A TranscriptSource describes where a transcript came from (provider,
- * session ID, file path, etc.).
+ * To add a new importer: write a parser that returns TranscriptEntry,
+ * then use toTranscriptJsonLine() to serialize.
  */
 /**
- * A parsed transcript: ordered messages plus session metadata.
+ * A parsed transcript: ordered messages plus session metadata (internal camelCase).
  */
 interface TranscriptEntry {
     readonly messages: Message[];
@@ -3829,7 +3870,7 @@ interface TranscriptEntry {
     readonly costUsd?: number | null;
 }
 /**
- * Metadata describing the origin of a transcript.
+ * Metadata describing the origin of a transcript (internal camelCase).
  */
 interface TranscriptSource {
     readonly provider: string;
@@ -3837,7 +3878,45 @@ interface TranscriptSource {
     readonly projectPath?: string;
     readonly startedAt?: string;
     readonly model?: string;
+    readonly version?: string;
+    readonly gitBranch?: string;
+    readonly cwd?: string;
+}
+/**
+ * One line in a transcript JSONL file (snake_case wire format).
+ *
+ * Each line is a self-contained test case with pre-populated output.
+ * The `input` field is the first user message; the `output` field is the
+ * full conversation (Message[]).
+ */
+interface TranscriptJsonLine {
+    readonly input: string;
+    readonly output: readonly Message[];
+    readonly token_usage?: {
+        readonly input: number;
+        readonly output: number;
+        readonly cached?: number;
+    };
+    readonly duration_ms?: number;
+    readonly cost_usd?: number | null;
+    readonly source: {
+        readonly provider: string;
+        readonly session_id: string;
+        readonly model?: string;
+        readonly timestamp?: string;
+        readonly git_branch?: string;
+        readonly cwd?: string;
+        readonly version?: string;
+    };
 }
+/**
+ * Convert a parsed TranscriptEntry to the on-disk JSONL wire format.
+ */
+declare function toTranscriptJsonLine(entry: TranscriptEntry): TranscriptJsonLine;
+/**
+ * Read a transcript JSONL file and parse each line into a TranscriptJsonLine.
+ */
+declare function readTranscriptJsonl(filePath: string): Promise<TranscriptJsonLine[]>;
 /**
  * Read a JSONL transcript file and return its raw text.
  * Throws if the file does not exist or cannot be read.
@@ -3871,6 +3950,70 @@ declare function readTranscriptFile(filePath: string): Promise<string>;
 declare function parseClaudeSession(jsonl: string): TranscriptEntry;
+/**
+ * Codex CLI session JSONL parser.
+ *
+ * Reads a Codex CLI rollout transcript
+ * (~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl) and converts it to a
+ * TranscriptEntry (AgentV's Message[] format plus session metadata).
+ *
+ * Each line is a JSON object with one of these top-level types:
+ *   session_meta   → session metadata (id, cwd, cli_version, model)
+ *   turn_context   → per-turn context (model, cwd, turn_id)
+ *   event_msg      → events: task_started, task_complete, user_message,
+ *                     agent_message, token_count
+ *   response_item  → conversation items: message, function_call,
+ *                     function_call_output, reasoning, custom_tool_call,
+ *                     custom_tool_call_output
+ *
+ * Key behaviors:
+ *   - response_item with type=message and role=user → user Message
+ *   - response_item with type=message and role=assistant → assistant Message
+ *   - response_item with type=function_call → ToolCall (pending output)
+ *   - response_item with type=function_call_output → matched to pending call by call_id
+ *   - response_item with type=reasoning → skipped (thinking tokens)
+ *   - response_item with role=developer → skipped (system prompt)
+ *   - session_meta → source metadata (session_id, cwd, version, model)
+ *   - turn_context → model name extraction
+ *   - Duration is the span between the first and last event timestamps
+ *   - cost_usd is null (Codex CLI does not report per-session cost)
+ *   - Token usage is not available from the rollout format (it carries rate limit info only)
+ *
+ * To add a new response_item type: add a case to the switch in parseCodexSession().
+ */
+declare function parseCodexSession(jsonl: string): TranscriptEntry; // NOTE(review): `jsonl` is presumably the rollout file's text (one JSON object per line), not a path — confirm
+/**
+ * Codex CLI session discovery.
+ *
+ * Scans ~/.codex/sessions/ for rollout JSONL files. Codex CLI stores sessions at:
+ *   ~/.codex/sessions/YYYY/MM/DD/rollout-<timestamp>-<uuid>.jsonl
+ *
+ * Sessions are returned sorted by modification time (most recent first).
+ */
+interface CodexSession {
+    /** UUID from the filename (the <uuid> part of rollout-<timestamp>-<uuid>.jsonl) */
+    readonly sessionId: string;
+    /** Full path to the JSONL file */
+    readonly filePath: string;
+    /** Filename (e.g., rollout-2026-03-29T14-22-01-<uuid>.jsonl) */
+    readonly filename: string;
+    /** Last modification time of the JSONL file; discovery results are sorted on modification time */
+    readonly updatedAt: Date;
+}
+interface CodexDiscoverOptions { // every option is optional; passed to discoverCodexSessions()
+    /** Filter by date string (YYYY-MM-DD). Presumably matched against the YYYY/MM/DD session directories — confirm. */
+    readonly date?: string;
+    /** Maximum number of sessions to return (default: 10). */
+    readonly limit?: number;
+    /** Override the default ~/.codex/sessions directory. */
+    readonly sessionsDir?: string;
+    /** Return only the most recent session. NOTE(review): interaction with `limit` is not visible in this declaration — confirm. */
+    readonly latest?: boolean;
+}
+declare function discoverCodexSessions(opts?: CodexDiscoverOptions): Promise<CodexSession[]>; // `opts` may be omitted; sessions are returned sorted by modification time, most recent first
 /**
  * Claude Code session discovery.
  *
@@ -3907,9 +4050,80 @@ interface ClaudeDiscoverOptions {
 }
 declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<ClaudeSession[]>;
+/**
+ * Transcript provider — replays pre-recorded session transcripts through the
+ * evaluation pipeline without invoking any live agent.
+ *
+ * Used by `agentv eval --transcript <file>` to grade imported sessions.
+ *
+ * How it works:
+ *   1. Reads a transcript JSONL file (produced by `agentv import`)
+ *   2. Each invoke() call consumes the next line from the transcript
+ *   3. Returns a ProviderResponse with pre-populated output, token usage, etc.
+ *   4. Evaluators run identically to live eval — they see the same ProviderResponse
+ *
+ * The provider name in results is set to the source provider from the transcript
+ * (e.g., "claude", "codex", "copilot").
+ */
+declare class TranscriptProvider implements Provider {
+    readonly id: string;
+    readonly kind: "transcript"; // literal tag; 'transcript' is also a member of ProviderKind
+    readonly targetName: string;
+    private lines; // presumably the transcript lines handed to the constructor — confirm
+    private cursor; // presumably the position of the next line to replay — confirm
+    constructor(targetName: string, lines: TranscriptJsonLine[]);
+    /**
+     * Create a TranscriptProvider from a JSONL file path.
+     */
+    static fromFile(filePath: string): Promise<TranscriptProvider>;
+    get lineCount(): number; // presumably the number of transcript lines held — confirm
+    invoke(_request: ProviderRequest): Promise<ProviderResponse>; // NOTE(review): behaviour once every line has been consumed is not visible in this declaration — confirm
+}
+/**
+ * Copilot CLI events.jsonl parser.
+ *
+ * Reads a Copilot CLI session transcript (events.jsonl) and converts it to
+ * AgentV's Message[] format. Each line is a JSON object with:
+ *   { type, data: { ...payload }, id, timestamp, parentId }
+ *
+ * All event-specific fields live under event.data.*, while type, id, timestamp,
+ * and parentId are at the top level.
+ *
+ * Supported event types:
+ *   session.start    → session metadata (data.sessionId, data.context.cwd)
+ *   user.message     → Message { role: 'user' }
+ *   assistant.message → Message { role: 'assistant', toolCalls from data.toolRequests }
+ *   skill.invoked    → ToolCall { tool: 'Skill', input: { skill: data.name } }
+ *   tool.execution_start + tool.execution_complete → ToolCall with output
+ *   session.shutdown → token usage from data.modelMetrics, end timestamp
+ *
+ * To add a new event type:
+ *   1. Add a case to the switch in parseCopilotEvents()
+ *   2. Map it to a Message or ToolCall
+ *   3. Add a test in copilot-log-parser.test.ts
+ */
+interface CopilotSessionMeta {
+    readonly sessionId: string; // from session.start (data.sessionId)
+    readonly model: string; // NOTE(review): the event supplying this value is not listed in the header — confirm
+    readonly cwd: string; // from session.start (data.context.cwd)
+    readonly repository?: string;
+    readonly branch?: string;
+    readonly startedAt?: string; // presumably the session.start timestamp — confirm
+}
+interface ParsedCopilotSession { // result shape of parseCopilotEvents()
+    readonly messages: Message[]; // messages built from the session's events
+    readonly meta: CopilotSessionMeta;
+    readonly tokenUsage?: ProviderTokenUsage; // from session.shutdown (data.modelMetrics)
+    readonly durationMs?: number; // presumably the start-to-shutdown span in milliseconds — confirm
+}
+declare function parseCopilotEvents(eventsJsonl: string): ParsedCopilotSession; // `eventsJsonl` is presumably the text of an events.jsonl file (one JSON event per line) — confirm
 type AgentKernel = { // NOTE(review): no documentation is visible for this type
     status: string; // not constrained beyond `string` in this declaration
 };
 declare function createAgentKernel(): AgentKernel; // takes no arguments; NOTE(review): the initial `status` value is not visible here
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type 
ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type 
SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, 
extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, 
EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type 
RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, 
discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, 
touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };