npm - @agentv/core - Versions diffs - 4.17.1-next.1 → 4.18.0-next.1 - Mend

@agentv/core 4.17.1-next.1 → 4.18.0-next.1

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/dist/{chunk-6VZY3B6M.js → chunk-PYDBJOAO.js} +6 -6
package/dist/chunk-PYDBJOAO.js.map +1 -0
package/dist/evaluation/validation/index.cjs +5 -5
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +3 -3
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +229 -238
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +156 -158
package/dist/index.d.ts +156 -158
package/dist/index.js +210 -216
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-6VZY3B6M.js.map +0 -1

package/dist/index.d.ts CHANGED Viewed

@@ -341,7 +341,7 @@ type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
 /**
  * Configuration for tool-trajectory evaluator.
  */
-interface ToolTrajectoryEvaluatorConfig {
+interface ToolTrajectoryGraderConfig {
     readonly name: string;
     readonly type: 'tool-trajectory';
     /** Matching mode */
@@ -355,7 +355,7 @@ interface ToolTrajectoryEvaluatorConfig {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
     /** Default argument matching mode for all expected items (defaults to 'exact') */
     readonly argsMatch?: ArgsMatchMode | readonly string[];
@@ -539,9 +539,9 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
-type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
-declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
+declare const GRADER_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
+type GraderKind = (typeof GRADER_KIND_VALUES)[number];
+declare function isGraderKind(value: unknown): value is GraderKind;
 /**
  * Configuration for enabling target access in code-grader evaluators.
  * When present, the runtime will start a local proxy server that allows
@@ -697,7 +697,7 @@ type WorkspaceConfig = {
      *  relative paths from their own directory, not the eval file's directory. */
     readonly workspaceFileDir?: string;
 };
-type CodeEvaluatorConfig = {
+type CodeGraderConfig = {
     readonly name: string;
     readonly type: 'code-grader';
     readonly command: readonly string[];
@@ -710,7 +710,7 @@ type CodeEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
     /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
     readonly config?: JsonObject;
@@ -739,7 +739,7 @@ type ContentPreprocessorConfig = {
     /** Resolved absolute path for the command script (last argv element) */
     readonly resolvedCommand?: readonly string[];
 };
-type LlmGraderEvaluatorConfig = {
+type LlmGraderConfig = {
     readonly name: string;
     readonly type: 'llm-grader';
     /** Text prompt (inline or file path) or executable script config */
@@ -754,7 +754,7 @@ type LlmGraderEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
     /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
     readonly target?: string;
@@ -767,8 +767,6 @@ type LlmGraderEvaluatorConfig = {
     /** Optional content preprocessors for ContentFile blocks in assistant output */
     readonly preprocessors?: readonly ContentPreprocessorConfig[];
 };
-/** @deprecated Use `LlmGraderEvaluatorConfig` instead */
-type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
 /**
  * Score range definition for analytic rubric scoring.
  * Each range maps an integer score band (0-10) to an outcome description.
@@ -830,16 +828,16 @@ type CompositeAggregatorConfig = {
     readonly type: 'threshold';
     readonly threshold: number;
 };
-type CompositeEvaluatorConfig = {
+type CompositeGraderConfig = {
     readonly name: string;
     readonly type: 'composite';
-    readonly assertions: readonly EvaluatorConfig[];
+    readonly assertions: readonly GraderConfig[];
     readonly aggregator: CompositeAggregatorConfig;
     readonly weight?: number;
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
@@ -874,7 +872,7 @@ type FieldConfig = {
 /**
  * Configuration for the field-accuracy evaluator.
  */
-type FieldAccuracyEvaluatorConfig = {
+type FieldAccuracyGraderConfig = {
     readonly name: string;
     readonly type: 'field-accuracy';
     /** Fields to compare between candidate and expected */
@@ -885,14 +883,14 @@ type FieldAccuracyEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the latency evaluator.
  * Checks execution duration against a threshold.
  */
-type LatencyEvaluatorConfig = {
+type LatencyGraderConfig = {
     readonly name: string;
     readonly type: 'latency';
     /** Maximum allowed duration in milliseconds */
@@ -901,14 +899,14 @@ type LatencyEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the cost evaluator.
  * Checks execution cost against a budget.
  */
-type CostEvaluatorConfig = {
+type CostGraderConfig = {
     readonly name: string;
     readonly type: 'cost';
     /** Maximum allowed cost in USD */
@@ -917,14 +915,14 @@ type CostEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the token-usage evaluator.
  * Checks provider-reported token usage against configured limits.
  */
-type TokenUsageEvaluatorConfig = {
+type TokenUsageGraderConfig = {
     readonly name: string;
     readonly type: 'token-usage';
     /** Maximum allowed total tokens (input + output + cached, when present) */
@@ -937,7 +935,7 @@ type TokenUsageEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
@@ -945,7 +943,7 @@ type TokenUsageEvaluatorConfig = {
  * Provides declarative threshold-based checks on execution metrics.
  * Only specified thresholds are checked; omitted ones are ignored.
  */
-type ExecutionMetricsEvaluatorConfig = {
+type ExecutionMetricsGraderConfig = {
     readonly name: string;
     readonly type: 'execution-metrics';
     /** Maximum allowed number of tool calls */
@@ -966,14 +964,14 @@ type ExecutionMetricsEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the contains assertion evaluator.
  * Checks whether the candidate output contains a specified substring.
  */
-type ContainsEvaluatorConfig = {
+type ContainsGraderConfig = {
     readonly name: string;
     readonly type: 'contains';
     readonly value: string;
@@ -981,14 +979,14 @@ type ContainsEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the contains_any assertion evaluator.
  * Checks whether the candidate output contains ANY of the specified substrings.
  */
-type ContainsAnyEvaluatorConfig = {
+type ContainsAnyGraderConfig = {
     readonly name: string;
     readonly type: 'contains-any';
     readonly value: readonly string[];
@@ -996,14 +994,14 @@ type ContainsAnyEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the contains_all assertion evaluator.
  * Checks whether the candidate output contains ALL of the specified substrings.
  */
-type ContainsAllEvaluatorConfig = {
+type ContainsAllGraderConfig = {
     readonly name: string;
     readonly type: 'contains-all';
     readonly value: readonly string[];
@@ -1011,14 +1009,14 @@ type ContainsAllEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the icontains assertion evaluator.
  * Case-insensitive check whether the candidate output contains a specified substring.
  */
-type IcontainsEvaluatorConfig = {
+type IcontainsGraderConfig = {
     readonly name: string;
     readonly type: 'icontains';
     readonly value: string;
@@ -1026,14 +1024,14 @@ type IcontainsEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the icontains_any assertion evaluator.
  * Case-insensitive check whether the candidate output contains ANY of the specified substrings.
  */
-type IcontainsAnyEvaluatorConfig = {
+type IcontainsAnyGraderConfig = {
     readonly name: string;
     readonly type: 'icontains-any';
     readonly value: readonly string[];
@@ -1041,14 +1039,14 @@ type IcontainsAnyEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the icontains_all assertion evaluator.
  * Case-insensitive check whether the candidate output contains ALL of the specified substrings.
  */
-type IcontainsAllEvaluatorConfig = {
+type IcontainsAllGraderConfig = {
     readonly name: string;
     readonly type: 'icontains-all';
     readonly value: readonly string[];
@@ -1056,14 +1054,14 @@ type IcontainsAllEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the starts_with assertion evaluator.
  * Checks whether the candidate output starts with a specified string (both trimmed).
  */
-type StartsWithEvaluatorConfig = {
+type StartsWithGraderConfig = {
     readonly name: string;
     readonly type: 'starts-with';
     readonly value: string;
@@ -1071,14 +1069,14 @@ type StartsWithEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the ends_with assertion evaluator.
  * Checks whether the candidate output ends with a specified string (both trimmed).
  */
-type EndsWithEvaluatorConfig = {
+type EndsWithGraderConfig = {
     readonly name: string;
     readonly type: 'ends-with';
     readonly value: string;
@@ -1086,14 +1084,14 @@ type EndsWithEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the regex assertion evaluator.
  * Checks whether the candidate output matches a regular expression pattern.
  */
-type RegexEvaluatorConfig = {
+type RegexGraderConfig = {
     readonly name: string;
     readonly type: 'regex';
     readonly value: string;
@@ -1103,28 +1101,28 @@ type RegexEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the is_json assertion evaluator.
  * Checks whether the candidate output is valid JSON.
  */
-type IsJsonEvaluatorConfig = {
+type IsJsonGraderConfig = {
     readonly name: string;
     readonly type: 'is-json';
     readonly weight?: number;
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
  * Configuration for the equals assertion evaluator.
  * Checks whether the candidate output exactly equals a specified string.
  */
-type EqualsEvaluatorConfig = {
+type EqualsGraderConfig = {
     readonly name: string;
     readonly type: 'equals';
     readonly value: string;
@@ -1132,7 +1130,7 @@ type EqualsEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
@@ -1147,7 +1145,7 @@ type RubricsEvaluatorConfig = {
     readonly required?: boolean | number;
     /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
     readonly min_score?: number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
+    /** When true, inverts the grader score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
 /**
@@ -1156,7 +1154,7 @@ type RubricsEvaluatorConfig = {
  * Tool-name resolution is automatic based on the provider kind.
  * For providers not covered by the built-in mapping, use a code-grader.
  */
-type SkillTriggerEvaluatorConfig = {
+type SkillTriggerGraderConfig = {
     readonly name: string;
     readonly type: 'skill-trigger';
     /** The skill name to check for (case-sensitive substring match) */
@@ -1182,7 +1180,7 @@ type InlineAssertEvaluatorConfig = {
     readonly min_score?: number;
     readonly negate?: boolean;
 };
-type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
+type GraderConfig = CodeGraderConfig | LlmGraderConfig | CompositeGraderConfig | ToolTrajectoryGraderConfig | FieldAccuracyGraderConfig | LatencyGraderConfig | CostGraderConfig | TokenUsageGraderConfig | ExecutionMetricsGraderConfig | SkillTriggerGraderConfig | ContainsGraderConfig | ContainsAnyGraderConfig | ContainsAllGraderConfig | IcontainsGraderConfig | IcontainsAnyGraderConfig | IcontainsAllGraderConfig | StartsWithGraderConfig | EndsWithGraderConfig | RegexGraderConfig | IsJsonGraderConfig | EqualsGraderConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
 /**
  * A single turn in a multi-turn conversation evaluation.
  * Each turn is a user message. The runner generates the assistant response.
@@ -1193,7 +1191,7 @@ interface ConversationTurn {
     /** Reference assistant response for grading (NOT carried forward — actual LLM response is used) */
     readonly expected_output?: TestMessageContent;
     /** Per-turn assertions. Strings become rubric criteria via shorthand. */
-    readonly assertions?: readonly (string | EvaluatorConfig)[];
+    readonly assertions?: readonly (string | GraderConfig)[];
 }
 /**
  * Conversation evaluation mode.
@@ -1228,8 +1226,8 @@ interface EvalTest {
     readonly reference_answer?: string;
     readonly file_paths: readonly string[];
     readonly criteria: string;
-    readonly evaluator?: EvaluatorKind;
-    readonly assertions?: readonly EvaluatorConfig[];
+    readonly evaluator?: GraderKind;
+    readonly assertions?: readonly GraderConfig[];
     /** Suite-level preprocessors used by the implicit default llm-grader. */
     readonly preprocessors?: readonly ContentPreprocessorConfig[];
     /** Workspace configuration (merged from suite-level and case-level) */
@@ -1293,7 +1291,7 @@ interface TrialResult {
     readonly attempt: number;
     readonly score: number;
     readonly verdict: EvaluationVerdict;
-    readonly scores?: readonly EvaluatorResult[];
+    readonly scores?: readonly GraderResult[];
     readonly error?: string;
     readonly costUsd?: number;
     /** Primary classification for this trial attempt */
@@ -1359,7 +1357,7 @@ interface ExecutionError {
  */
 type FailOnError = boolean;
 /**
- * Evaluator scorecard for a single eval case run.
+ * Grader scorecard for a single eval case run.
  */
 interface EvaluationResult {
     readonly timestamp: string;
@@ -1390,7 +1388,7 @@ interface EvaluationResult {
         readonly lm?: JsonObject;
         readonly evaluator?: JsonObject;
     };
-    readonly scores?: readonly EvaluatorResult[];
+    readonly scores?: readonly GraderResult[];
     readonly error?: string;
     /** Lightweight summary of the execution trace (always included when available) */
     readonly trace?: TraceSummary;
@@ -1433,9 +1431,9 @@ interface EvaluationResult {
     readonly executionError?: ExecutionError;
 }
 type EvaluationVerdict = 'pass' | 'fail' | 'skip';
-interface EvaluatorResult {
+interface GraderResult {
     readonly name: string;
-    readonly type: EvaluatorKind;
+    readonly type: GraderKind;
     readonly score: number;
     readonly weight?: number;
     readonly verdict?: EvaluationVerdict;
@@ -1444,7 +1442,7 @@ interface EvaluatorResult {
     readonly input?: JsonObject;
     /** Target name used for grading (e.g., the LLM provider name). */
     readonly target?: string;
-    readonly scores?: readonly EvaluatorResult[];
+    readonly scores?: readonly GraderResult[];
     /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
     readonly details?: JsonObject;
     /** Token usage from LLM calls made by this evaluator (optional). */
@@ -1642,7 +1640,7 @@ type EvalSuiteResult = {
     /** Suite-level metadata (name, description, version, etc.) */
     readonly metadata?: EvalMetadata;
     /** Suite-level total cost budget in USD */
-    readonly totalBudgetUsd?: number;
+    readonly budgetUsd?: number;
     /** Execution error tolerance: true or false */
     readonly failOnError?: FailOnError;
     /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
@@ -2346,8 +2344,8 @@ interface EvaluationContext {
     readonly graderProvider?: Provider;
     /** @deprecated Use `graderProvider` instead */
     readonly judgeProvider?: Provider;
-    readonly evaluatorTemplateOverride?: string;
-    readonly evaluator?: EvaluatorConfig;
+    readonly graderTemplateOverride?: string;
+    readonly evaluator?: GraderConfig;
     /** Output messages from agent execution (primary source for tool trajectory) */
     readonly output?: readonly Message[];
     /** Lightweight summary of trace events (if available) */
@@ -2380,8 +2378,8 @@ interface EvaluationScore {
     readonly verdict: EvaluationVerdict;
     readonly assertions: readonly AssertionEntry[];
     readonly expectedAspectCount: number;
-    readonly evaluatorRawRequest?: JsonObject;
-    readonly scores?: readonly ChildEvaluatorResult[];
+    readonly graderRawRequest?: JsonObject;
+    readonly scores?: readonly ChildGraderResult[];
     /** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
     readonly details?: JsonObject;
     /** Token usage from LLM calls made by this evaluator (optional). */
@@ -2389,26 +2387,26 @@ interface EvaluationScore {
     /** Target name used for grading (e.g., the LLM provider). */
     readonly graderTarget?: string;
 }
-interface ChildEvaluatorResult {
+interface ChildGraderResult {
     readonly name: string;
     readonly type: string;
     readonly score: number;
     readonly weight?: number;
     readonly verdict: EvaluationVerdict;
     readonly assertions: readonly AssertionEntry[];
-    readonly evaluatorRawRequest?: JsonObject;
-    readonly scores?: readonly ChildEvaluatorResult[];
+    readonly graderRawRequest?: JsonObject;
+    readonly scores?: readonly ChildGraderResult[];
     /** Optional structured details from evaluators (e.g., TP/TN/FP/FN counts, alignments, per-turn scores). */
     readonly details?: JsonObject;
     /** Token usage from LLM calls made by this evaluator (optional). */
     readonly tokenUsage?: TokenUsage;
 }
-interface Evaluator {
+interface Grader {
     readonly kind: string;
     evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
 }
-interface EvaluatorFactory {
-    create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
+interface GraderFactory {
+    create(config: GraderConfig, context: EvaluationContext): Grader;
 }
 /**
@@ -2447,7 +2445,7 @@ declare function deepEqual(a: unknown, b: unknown): boolean;
  */
 declare function negateScore(score: EvaluationScore): EvaluationScore;
-interface CodeEvaluatorOptions {
+interface CodeGraderOptions {
     readonly command: readonly string[];
     /** @deprecated Use `command` instead */
     readonly script?: readonly string[];
@@ -2458,29 +2456,29 @@ interface CodeEvaluatorOptions {
     /** Target access config - when present, enables target invocation */
     readonly target?: TargetAccessConfig;
 }
-declare class CodeEvaluator implements Evaluator {
+declare class CodeGrader implements Grader {
     readonly kind = "code-grader";
     private readonly command;
     private readonly cwd?;
     private readonly agentTimeoutMs?;
     private readonly config?;
     private readonly target?;
-    constructor(options: CodeEvaluatorOptions);
+    constructor(options: CodeGraderOptions);
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
 }
 declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
-interface CompositeEvaluatorOptions {
-    readonly config: CompositeEvaluatorConfig;
-    readonly evaluatorFactory: EvaluatorFactory;
+interface CompositeGraderOptions {
+    readonly config: CompositeGraderConfig;
+    readonly evaluatorFactory: GraderFactory;
     readonly cwd?: string;
 }
-declare class CompositeEvaluator implements Evaluator {
+declare class CompositeGrader implements Grader {
     readonly kind = "composite";
     private readonly config;
     private readonly evaluatorFactory;
     private readonly cwd?;
-    constructor(options: CompositeEvaluatorOptions);
+    constructor(options: CompositeGraderOptions);
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private aggregate;
     private runWeightedAverage;
@@ -2489,50 +2487,50 @@ declare class CompositeEvaluator implements Evaluator {
     private runLlmAggregator;
 }
-interface CostEvaluatorOptions {
-    readonly config: CostEvaluatorConfig;
+interface CostGraderOptions {
+    readonly config: CostGraderConfig;
 }
 /**
- * Evaluator that checks execution cost against a budget.
+ * Grader that checks execution cost against a budget.
  * Uses costUsd from the evaluation context.
  */
-declare class CostEvaluator implements Evaluator {
+declare class CostGrader implements Grader {
     readonly kind = "cost";
     private readonly config;
-    constructor(options: CostEvaluatorOptions);
+    constructor(options: CostGraderOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
 }
-interface ExecutionMetricsEvaluatorOptions {
-    readonly config: ExecutionMetricsEvaluatorConfig;
+interface ExecutionMetricsGraderOptions {
+    readonly config: ExecutionMetricsGraderConfig;
 }
 /**
- * Evaluator that checks execution metrics against configured thresholds.
+ * Grader that checks execution metrics against configured thresholds.
  * Supports multiple threshold types: tool calls, LLM calls, tokens, cost, duration,
  * and exploration ratio. Only specified thresholds are checked.
  *
  * Score is proportional: passed / total assertions
  */
-declare class ExecutionMetricsEvaluator implements Evaluator {
+declare class ExecutionMetricsGrader implements Grader {
     readonly kind = "execution-metrics";
     private readonly config;
-    constructor(options: ExecutionMetricsEvaluatorOptions);
+    constructor(options: ExecutionMetricsGraderOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
     private extractConfiguredThresholds;
     private filterDefinedMetrics;
 }
-interface FieldAccuracyEvaluatorOptions {
-    readonly config: FieldAccuracyEvaluatorConfig;
+interface FieldAccuracyGraderOptions {
+    readonly config: FieldAccuracyGraderConfig;
 }
 /**
- * FieldAccuracyEvaluator compares extracted structured data against expected values
+ * FieldAccuracyGrader compares extracted structured data against expected values
  * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
  */
-declare class FieldAccuracyEvaluator implements Evaluator {
+declare class FieldAccuracyGrader implements Grader {
     readonly kind = "field-accuracy";
     private readonly config;
-    constructor(options: FieldAccuracyEvaluatorOptions);
+    constructor(options: FieldAccuracyGraderOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
     /**
      * Extract expected data from expected_output array.
@@ -2561,33 +2559,33 @@ declare class FieldAccuracyEvaluator implements Evaluator {
     private aggregateResults;
 }
-interface LatencyEvaluatorOptions {
-    readonly config: LatencyEvaluatorConfig;
+interface LatencyGraderOptions {
+    readonly config: LatencyGraderConfig;
 }
 /**
- * Evaluator that checks execution duration against a threshold.
+ * Grader that checks execution duration against a threshold.
  * Uses durationMs from the evaluation context.
  */
-declare class LatencyEvaluator implements Evaluator {
+declare class LatencyGrader implements Grader {
     readonly kind = "latency";
     private readonly config;
-    constructor(options: LatencyEvaluatorOptions);
+    constructor(options: LatencyGraderOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
 }
 /**
- * Default evaluator template for the user prompt (variables will be substituted).
- * Custom evaluators can override this via evaluatorTemplate option.
+ * Default grader template for the user prompt (variables will be substituted).
+ * Custom graders can override this via graderTemplate option.
  */
-declare const DEFAULT_EVALUATOR_TEMPLATE: string;
+declare const DEFAULT_GRADER_TEMPLATE: string;
 type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
-interface LlmGraderEvaluatorOptions {
+interface LlmGraderOptions {
     readonly resolveGraderProvider: GraderProviderResolver;
     /** @deprecated Use `resolveGraderProvider` instead. */
     readonly resolveJudgeProvider?: GraderProviderResolver;
     readonly maxOutputTokens?: number;
     readonly temperature?: number;
-    readonly evaluatorTemplate?: string;
+    readonly graderTemplate?: string;
     readonly maxSteps?: number;
     readonly graderTargetProvider?: Provider;
     /** @deprecated Use `graderTargetProvider` instead. */
@@ -2657,15 +2655,15 @@ declare const rubricEvaluationSchema: z.ZodObject<{
     overall_reasoning: string;
 }>;
-declare class LlmGraderEvaluator implements Evaluator {
+declare class LlmGrader implements Grader {
     readonly kind = "llm-grader";
     private readonly resolveGraderProvider;
     private readonly maxOutputTokens?;
     private readonly temperature?;
-    private readonly evaluatorTemplate?;
+    private readonly graderTemplate?;
     private readonly maxSteps;
     private readonly graderTargetProvider?;
-    constructor(options: LlmGraderEvaluatorOptions);
+    constructor(options: LlmGraderOptions);
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private prepareContext;
     private evaluateFreeform;
@@ -2722,7 +2720,7 @@ declare class LlmGraderEvaluator implements Evaluator {
 }
 /**
  * Build the mandatory output schema that all evaluators must follow.
- * This schema is always appended to the evaluator template.
+ * This schema is always appended to the grader template.
  */
 declare function buildOutputSchema(): string;
 declare function buildRubricOutputSchema(): string;
@@ -2766,10 +2764,10 @@ declare function extractImageBlocks(messages: readonly Message[]): ContentImage[
  *   names (input.skill, input.file_path) regardless of provider.
  */
-declare class SkillTriggerEvaluator implements Evaluator {
+declare class SkillTriggerGrader implements Grader {
     readonly kind = "skill-trigger";
     private readonly config;
-    constructor(config: SkillTriggerEvaluatorConfig);
+    constructor(config: SkillTriggerGraderConfig);
     evaluate(context: EvaluationContext): EvaluationScore;
 }
@@ -2783,33 +2781,33 @@ declare function assembleLlmGraderPrompt(input: {
     evalCase: EvalTest;
     candidate: string;
     promptInputs: PromptInputs;
-    evaluatorConfig?: LlmGraderEvaluatorConfig;
+    evaluatorConfig?: LlmGraderConfig;
     output?: readonly Message[];
     fileChanges?: string;
-    evaluatorTemplateOverride?: string;
+    graderTemplateOverride?: string;
 }): LlmGraderPromptAssembly;
-interface TokenUsageEvaluatorOptions {
-    readonly config: TokenUsageEvaluatorConfig;
+interface TokenUsageGraderOptions {
+    readonly config: TokenUsageGraderConfig;
 }
 /**
- * Evaluator that checks provider-reported token usage against configured limits.
+ * Grader that checks provider-reported token usage against configured limits.
  * Uses tokenUsage from the evaluation context.
  */
-declare class TokenUsageEvaluator implements Evaluator {
+declare class TokenUsageGrader implements Grader {
     readonly kind = "token-usage";
     private readonly config;
-    constructor(options: TokenUsageEvaluatorOptions);
+    constructor(options: TokenUsageGraderOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
 }
-interface ToolTrajectoryEvaluatorOptions {
-    readonly config: ToolTrajectoryEvaluatorConfig;
+interface ToolTrajectoryGraderOptions {
+    readonly config: ToolTrajectoryGraderConfig;
 }
-declare class ToolTrajectoryEvaluator implements Evaluator {
+declare class ToolTrajectoryGrader implements Grader {
     readonly kind = "tool-trajectory";
     private readonly config;
-    constructor(options: ToolTrajectoryEvaluatorOptions);
+    constructor(options: ToolTrajectoryGraderOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
     /**
      * Extract tool calls from output messages.
@@ -2873,7 +2871,7 @@ declare function runIsJsonAssertion(output: string): AssertionResult;
 declare function runEqualsAssertion(output: string, value: string): AssertionResult;
 /**
- * Extensible evaluator registry.
+ * Extensible grader registry.
  *
  * Replaces the hardcoded switch/case dispatch in the orchestrator with
  * a registry of named factory functions. Built-in evaluators are registered
@@ -2882,10 +2880,10 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
  */
 /**
- * Context passed to evaluator factory functions during creation.
+ * Context passed to grader factory functions during creation.
  * Contains shared resources needed by evaluator instances.
  */
-interface EvaluatorDispatchContext {
+interface GraderDispatchContext {
     /** Shared LLM grader provider (resolved at suite level) */
     readonly graderProvider?: Provider;
     /** @deprecated Use `graderProvider` instead */
@@ -2899,48 +2897,48 @@ interface EvaluatorDispatchContext {
     /** Directory containing the eval file (for composite member resolution) */
     readonly evalFileDir?: string;
     /** Shared LLM grader evaluator instance */
-    readonly llmGrader: Evaluator;
+    readonly llmGrader: Grader;
     /** @deprecated Use `llmGrader` instead */
-    readonly llmJudge?: Evaluator;
+    readonly llmJudge?: Grader;
     /** Reference to the registry itself (for composite evaluators that need to create children) */
-    readonly registry: EvaluatorRegistry;
+    readonly registry: GraderRegistry;
 }
 /**
- * Factory function that creates an Evaluator instance from a config.
+ * Factory function that creates a Grader instance from a config.
  *
  * Factory functions handle all type-specific initialization logic:
  * - Reading prompt files for LLM graders
  * - Resolving script paths for code graders
  * - Creating adapter evaluators for deterministic assertions
  */
-type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
+type GraderFactoryFn = (config: GraderConfig, context: GraderDispatchContext) => Grader | Promise<Grader>;
 /**
- * Registry of evaluator factory functions keyed by evaluator type name.
+ * Registry of grader factory functions keyed by grader type name.
  *
  * Built-in evaluators are registered at startup. Custom evaluators can be
  * registered via the `register()` method or discovered from `.agentv/assertions/`.
  */
-declare class EvaluatorRegistry {
+declare class GraderRegistry {
     private readonly factories;
-    /** Register a factory function for an evaluator type. */
-    register(type: string, factory: EvaluatorFactoryFn): this;
-    /** Get the factory function for an evaluator type. */
-    get(type: string): EvaluatorFactoryFn | undefined;
+    /** Register a factory function for a grader type. */
+    register(type: string, factory: GraderFactoryFn): this;
+    /** Get the factory function for a grader type. */
+    get(type: string): GraderFactoryFn | undefined;
     /** Check if a factory is registered for the given type. */
     has(type: string): boolean;
-    /** List all registered evaluator type names. */
+    /** List all registered grader type names. */
     list(): string[];
     /**
      * Create an evaluator instance from a config, using the registered factory.
-     * Throws if no factory is registered for the evaluator type.
+     * Throws if no factory is registered for the grader type.
      */
-    create(config: EvaluatorConfig, context: EvaluatorDispatchContext): Promise<Evaluator>;
+    create(config: GraderConfig, context: GraderDispatchContext): Promise<Grader>;
 }
 /**
- * Adapter that wraps a synchronous assertion function as an Evaluator.
+ * Adapter that wraps a synchronous assertion function as a Grader.
  * Used for deterministic assertions (contains, regex, is-json, equals).
  */
-declare class DeterministicAssertionEvaluator implements Evaluator {
+declare class DeterministicAssertionGrader implements Grader {
     private readonly assertFn;
     readonly kind: string;
     constructor(kind: string, assertFn: (context: EvaluationContext) => EvaluationScore);
@@ -2988,8 +2986,8 @@ interface RunEvalCaseOptions {
     readonly evalCase: EvalTest;
     readonly provider: Provider;
     readonly target: ResolvedTarget;
-    readonly evaluators: Partial<Record<string, Evaluator>> & {
-        readonly 'llm-grader': Evaluator;
+    readonly evaluators: Partial<Record<string, Grader>> & {
+        readonly 'llm-grader': Grader;
     };
     readonly now?: () => Date;
     readonly maxRetries?: number;
@@ -3020,8 +3018,8 @@ interface RunEvalCaseOptions {
     readonly suiteWorkspaceFile?: string;
     /** Real-time observability callbacks passed to the provider */
     readonly streamCallbacks?: ProviderStreamCallbacks;
-    /** Evaluator type registry (with custom assertions discovered) */
-    readonly typeRegistry?: EvaluatorRegistry;
+    /** Grader type registry (with custom assertions discovered) */
+    readonly typeRegistry?: GraderRegistry;
     /** RepoManager instance for repo lifecycle (shared workspace mode) */
     readonly repoManager?: RepoManager;
     /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
@@ -3054,7 +3052,7 @@ interface RunEvaluationOptions {
     readonly targets?: readonly TargetDefinition[];
     readonly env?: EnvLookup;
     readonly providerFactory?: (target: ResolvedTarget) => Provider;
-    readonly evaluators?: Partial<Record<string, Evaluator>>;
+    readonly evaluators?: Partial<Record<string, Grader>>;
     readonly maxRetries?: number;
     readonly agentTimeoutMs?: number;
     readonly cache?: EvaluationCache;
@@ -3076,7 +3074,7 @@ interface RunEvaluationOptions {
     /** Real-time observability callbacks passed to the provider */
     readonly streamCallbacks?: ProviderStreamCallbacks;
     /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
-    readonly totalBudgetUsd?: number;
+    readonly budgetUsd?: number;
     /** Execution error tolerance: true halts on first error */
     readonly failOnError?: FailOnError;
     /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
@@ -3111,7 +3109,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
  * Types for inline assertion functions used in the evaluate() API.
  *
  * Inline functions are the escape hatch for custom evaluation logic
- * that doesn't fit a built-in evaluator type. For built-in assertions
+ * that doesn't fit a built-in grader type. For built-in assertions
  * (contains, regex, is-json, etc.), use config objects instead:
  *
  *   assert: [{ type: 'contains', value: 'hello' }]
@@ -4186,17 +4184,17 @@ declare class OtlpJsonFileExporter {
 }
 /**
- * Factory functions for all built-in evaluator types.
+ * Factory functions for all built-in grader types.
  *
- * Each factory creates an Evaluator instance from an EvaluatorConfig,
+ * Each factory creates a Grader instance from a GraderConfig,
  * handling type-specific initialization logic. These are registered into
- * the EvaluatorRegistry at startup.
+ * the GraderRegistry at startup.
  */
 /**
- * Create a new EvaluatorRegistry with all built-in evaluator types registered.
+ * Create a new GraderRegistry with all built-in grader types registered.
  */
-declare function createBuiltinRegistry(): EvaluatorRegistry;
+declare function createBuiltinRegistry(): GraderRegistry;
 /**
  * Convention-based discovery of custom assertion scripts.
@@ -4216,27 +4214,27 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
  * @returns Names of discovered assertion types
  */
-declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
+declare function discoverAssertions(registry: GraderRegistry, baseDir: string): Promise<string[]>;
 /**
  * Convention-based discovery of custom grader scripts.
  *
  * Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
- * files and registers them as code-grader evaluators in the registry. The file name
- * (without extension) becomes the evaluator type name.
+ * files and registers them as code graders in the registry. The file name
+ * (without extension) becomes the grader type name.
  *
  * Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
  */
 /**
  * Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
- * and register them as evaluator types in the registry.
+ * and register them as grader types in the registry.
  *
- * @param registry - The evaluator registry to register discovered graders into
+ * @param registry - The grader registry to register discovered graders into
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
  * @returns Names of discovered grader types
  */
-declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
+declare function discoverGraders(registry: GraderRegistry, baseDir: string): Promise<string[]>;
 /**
  * Core types for the transcript import pipeline.
@@ -4489,7 +4487,7 @@ declare function discoverClaudeSessions(opts?: ClaudeDiscoverOptions): Promise<C
  *   1. Reads a transcript JSONL file (produced by `agentv import`)
  *   2. Each invocation pops the next line from the transcript
  *   3. Returns a ProviderResponse with pre-populated output, token usage, etc.
- *   4. Evaluators run identically to live eval — they see the same ProviderResponse
+ *   4. Graders run identically to live eval — they see the same ProviderResponse
  *
  * The provider name in results is set to the source provider from the transcript
  * (e.g., "claude", "codex", "copilot").
@@ -4555,4 +4553,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type 
PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, 
calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, 
prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, 
type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type 
RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, 
createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, 
runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };