npm - @agentv/core - Versions diffs - 2.19.0 → 3.0.0-next.1 - Mend

@agentv/core 2.19.0 → 3.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/dist/agentv-provider-5CJVBBGG.js +7 -0
package/dist/agentv-provider-5CJVBBGG.js.map +1 -0
package/dist/{chunk-ACTIPQZ3.js → chunk-CASGWWOU.js} +56 -20
package/dist/chunk-CASGWWOU.js.map +1 -0
package/dist/chunk-XBGLLO22.js +65 -0
package/dist/chunk-XBGLLO22.js.map +1 -0
package/dist/evaluation/validation/index.cjs +31 -14
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +19 -10
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +4690 -3406
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +362 -137
package/dist/index.d.ts +362 -137
package/dist/index.js +7316 -6147
package/dist/index.js.map +1 -1
package/package.json +3 -2
package/dist/chunk-ACTIPQZ3.js.map +0 -1

package/dist/index.d.ts CHANGED

@@ -8,7 +8,7 @@ interface ChatMessage {
     readonly name?: string;
 }
 type ChatPrompt = readonly ChatMessage[];
-type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
+type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
 /** Callbacks for real-time observability during provider execution */
 interface ProviderStreamCallbacks {
     onToolCallStart?: (toolName: string, toolCallId?: string) => void;
@@ -144,6 +144,8 @@ type EnvLookup = Readonly<Record<string, string | undefined>>;
 interface TargetDefinition {
     readonly name: string;
     readonly provider: ProviderKind | string;
+    readonly grader_target?: string | undefined;
+    /** @deprecated Use `grader_target` instead */
     readonly judge_target?: string | undefined;
     readonly workers?: number | undefined;
     readonly provider_batching?: boolean | undefined;
@@ -463,11 +465,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 /**
- * Configuration for enabling target access in code-judge evaluators.
+ * Configuration for enabling target access in code-grader evaluators.
  * When present, the runtime will start a local proxy server that allows
  * the script to invoke configured targets without direct credential access.
  */
@@ -566,7 +568,7 @@ type WorkspaceConfig = {
 };
 type CodeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'code-judge';
+    readonly type: 'code-judge' | 'code-grader';
     readonly command: readonly string[];
     /** @deprecated Use `command` instead */
     readonly script?: readonly string[];
@@ -577,14 +579,14 @@ type CodeEvaluatorConfig = {
     readonly required?: boolean | number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
-    /** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
+    /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
     readonly config?: JsonObject;
     /** When present, enables target access via local proxy */
     readonly target?: TargetAccessConfig;
 };
 /**
  * Executable prompt template configuration.
- * Matches code-judge pattern for consistency.
+ * Matches code-grader pattern for consistency.
  */
 type PromptScriptConfig = {
     /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
@@ -594,24 +596,32 @@ type PromptScriptConfig = {
     /** Pass-through configuration for the prompt template */
     readonly config?: Record<string, unknown>;
 };
-type LlmJudgeEvaluatorConfig = {
+type LlmGraderEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'llm-judge';
+    readonly type: 'llm-grader' | 'llm-judge';
     /** Text prompt (inline or file path) or executable script config */
     readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
     /** Resolved absolute path for prompt file (used for text template prompts) */
     readonly resolvedPromptPath?: string;
-    /** Resolved script array for executable prompts (matches code-judge pattern) */
+    /** Resolved script array for executable prompts (matches code-grader pattern) */
     readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
     readonly required?: boolean | number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
+    /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
+    readonly target?: string;
     /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
     readonly config?: Record<string, unknown>;
+    /** Maximum agent steps for agentv built-in mode (default 10, max 50). Ignored in LLM mode. */
+    readonly max_steps?: number;
+    /** Temperature override for grader calls */
+    readonly temperature?: number;
 };
+/** @deprecated Use `LlmGraderEvaluatorConfig` instead */
+type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
 /**
  * Score range definition for analytic rubric scoring.
  * Each range maps an integer score band (0-10) to an outcome description.
@@ -623,7 +633,7 @@ type ScoreRange = {
     readonly outcome: string;
 };
 /**
- * Rubric item for LLM judge evaluation.
+ * Rubric item for LLM grader evaluation.
  * Supports two modes:
  * - Checklist mode: boolean satisfied/not-satisfied with `outcome`
  * - Score-range mode: 0-10 integer scoring with `score_ranges`
@@ -648,7 +658,7 @@ type RubricItem = {
     readonly required_min_score?: number;
     /**
      * Score range definitions for analytic rubric scoring.
-     * When present, the judge outputs an integer 0-10 score per criterion.
+     * When present, the grader outputs an integer 0-10 score per criterion.
      * Ranges must be non-overlapping and cover 0-10 inclusive.
      */
     readonly score_ranges?: readonly ScoreRange[];
@@ -656,10 +666,19 @@ type RubricItem = {
 type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
     readonly weights?: Record<string, number>;
+} | {
+    readonly type: 'code-grader';
+    readonly path: string;
+    readonly cwd?: string;
 } | {
     readonly type: 'code-judge';
     readonly path: string;
     readonly cwd?: string;
+} | {
+    readonly type: 'llm-grader';
+    readonly prompt?: string;
+    readonly promptPath?: string;
+    readonly model?: string;
 } | {
     readonly type: 'llm-judge';
     readonly prompt?: string;
@@ -672,7 +691,7 @@ type CompositeAggregatorConfig = {
 type CompositeEvaluatorConfig = {
     readonly name: string;
     readonly type: 'composite';
-    readonly evaluators: readonly EvaluatorConfig[];
+    readonly assertions: readonly EvaluatorConfig[];
     readonly aggregator: CompositeAggregatorConfig;
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -681,7 +700,7 @@ type CompositeEvaluatorConfig = {
 };
 /**
  * Match type for field accuracy evaluation.
- * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-grader evaluator.
  * See examples/features/document-extraction/fuzzy_match.ts for an example.
  */
 type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
@@ -796,34 +815,6 @@ type ExecutionMetricsEvaluatorConfig = {
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
-/**
- * Configuration for the agent-judge evaluator.
- * Runs an agentic investigation loop to audit workspaces and verify criteria.
- * Two modes:
- * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
- * - Judge target: Delegates to an external agent provider via Provider.invoke()
- */
-type AgentJudgeEvaluatorConfig = {
-    readonly name: string;
-    readonly type: 'agent-judge';
-    /** Custom evaluation prompt (inline text or file path) */
-    readonly prompt?: string;
-    readonly promptPath?: string;
-    /** Resolved absolute path for prompt file */
-    readonly resolvedPromptPath?: string;
-    /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
-    readonly rubrics?: readonly RubricItem[];
-    /** Maximum agent steps for built-in mode (default 10, max 50) */
-    readonly max_steps?: number;
-    /** Temperature for built-in mode (default 0) */
-    readonly temperature?: number;
-    /** Target name — delegates agent loop to this provider instead of built-in mode */
-    readonly target?: string;
-    readonly weight?: number;
-    readonly required?: boolean | number;
-    /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
-    readonly negate?: boolean;
-};
 /**
  * Configuration for the contains assertion evaluator.
  * Checks whether the candidate output contains a specified substring.
@@ -981,7 +972,34 @@ type RubricsEvaluatorConfig = {
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
 };
-type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
+/**
+ * Configuration for the skill-trigger evaluator.
+ * Detects whether the agent invoked a named Claude Code skill as its first tool call.
+ * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py.
+ */
+type SkillTriggerEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'skill-trigger';
+    /** The skill name to check for (case-sensitive substring match) */
+    readonly skill: string;
+    /** Whether the skill is expected to trigger (default: true) */
+    readonly should_trigger?: boolean;
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    readonly negate?: boolean;
+};
+/**
+ * Configuration for the inline-assert evaluator.
+ * Wraps an AssertFn for in-process evaluation via the evaluate() API.
+ */
+type InlineAssertEvaluatorConfig = {
+    readonly name: string;
+    readonly type: 'inline-assert';
+    readonly weight?: number;
+    readonly required?: boolean | number;
+    readonly negate?: boolean;
+};
+type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
 /**
  * Eval test definition sourced from AgentV specs.
  */
@@ -999,7 +1017,7 @@ interface EvalTest {
     readonly file_paths: readonly string[];
     readonly criteria: string;
     readonly evaluator?: EvaluatorKind;
-    readonly evaluators?: readonly EvaluatorConfig[];
+    readonly assertions?: readonly EvaluatorConfig[];
     /** Workspace configuration (merged from suite-level and case-level) */
     readonly workspace?: WorkspaceConfig;
     /** Arbitrary metadata passed to workspace scripts via stdin */
@@ -1172,15 +1190,15 @@ interface EvaluatorResult {
     readonly rawRequest?: JsonObject;
     readonly evaluatorProviderRequest?: JsonObject;
     readonly scores?: readonly EvaluatorResult[];
-    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
+    /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
     readonly details?: JsonObject;
     /** Token usage from LLM calls made by this evaluator (optional). */
     readonly tokenUsage?: TokenUsage;
-    /** Wall-clock duration of this judge execution in milliseconds. */
+    /** Wall-clock duration of this grader execution in milliseconds. */
     readonly durationMs?: number;
-    /** ISO 8601 UTC timestamp when this judge started executing. */
+    /** ISO 8601 UTC timestamp when this grader started executing. */
     readonly startedAt?: string;
-    /** ISO 8601 UTC timestamp when this judge finished executing. */
+    /** ISO 8601 UTC timestamp when this grader finished executing. */
     readonly endedAt?: string;
 }
 /**
@@ -1314,7 +1332,7 @@ declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): P
 /**
  * Detect file format by extension.
  */
-declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
+declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json';
 type LoadOptions = {
     readonly verbose?: boolean;
@@ -1366,6 +1384,83 @@ declare function loadTestById(evalFilePath: string, repoRoot: URL | string, eval
 /** @deprecated Use `loadTestById` instead */
 declare const loadEvalCaseById: typeof loadTestById;
+/**
+ * Raw Agent Skills evals.json schema.
+ * @see https://agentskills.io/skill-creation/evaluating-skills
+ */
+interface AgentSkillsEvalsFile {
+    readonly skill_name?: string;
+    readonly evals: readonly AgentSkillsEvalCase[];
+}
+interface AgentSkillsEvalCase {
+    readonly id: number;
+    readonly prompt: string;
+    readonly expected_output?: string;
+    readonly files?: readonly string[];
+    readonly assertions?: readonly string[];
+}
+/**
+ * Detect whether a JSON file is in Agent Skills evals.json format.
+ * Returns true if the parsed content has an `evals` array.
+ */
+declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEvalsFile;
+/**
+ * Parse already-loaded Agent Skills evals data into EvalTest[].
+ * Exported for testing without file I/O.
+ */
+declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
+/**
+ * EVAL.yaml → evals.json transpiler.
+ *
+ * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
+ * for consumption by the skill-creator pipeline.
+ *
+ * Handles both `assertions:` (current) and `assert:` (deprecated alias).
+ */
+interface EvalsJsonCase {
+    id: number;
+    prompt: string;
+    expected_output?: string;
+    files?: string[];
+    should_trigger?: boolean;
+    assertions: string[];
+}
+interface EvalsJsonFile {
+    skill_name: string;
+    evals: EvalsJsonCase[];
+}
+/**
+ * Result of transpiling a single EVAL.yaml.
+ * May produce multiple evals.json files (one per skill).
+ */
+interface TranspileResult {
+    /** Map from skill_name → EvalsJsonFile */
+    files: Map<string, EvalsJsonFile>;
+    /** Warning messages accumulated during transpilation */
+    warnings: string[];
+}
+/**
+ * Transpile a parsed EVAL.yaml object into one or more evals.json objects.
+ *
+ * @param suite  Parsed YAML object (already loaded, no file I/O here)
+ * @param source Source identifier for error messages (e.g. file path)
+ */
+declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
+/**
+ * Transpile an EVAL.yaml file into one or more evals.json objects.
+ * Returns a map from output filename → JSON content.
+ *
+ * @param evalYamlPath  Absolute path to the EVAL.yaml file
+ */
+declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
+/**
+ * Determine the output filename(s) for a transpile result.
+ * Single skill → "evals.json"
+ * Multiple skills → "<skill>.evals.json"
+ */
+declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
 declare function fileExists(filePath: string): Promise<boolean>;
 /**
  * Normalize line endings to LF (\n).
@@ -1613,87 +1708,112 @@ interface VSCodeResolvedConfig {
     readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
 }
+interface AgentVResolvedConfig {
+    readonly model: string;
+    readonly temperature: number;
+}
 type ResolvedTarget = {
     readonly kind: 'azure';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: AzureResolvedConfig;
 } | {
     readonly kind: 'anthropic';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: AnthropicResolvedConfig;
 } | {
     readonly kind: 'gemini';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: GeminiResolvedConfig;
 } | {
     readonly kind: 'codex';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: CodexResolvedConfig;
 } | {
     readonly kind: 'copilot-sdk';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: CopilotSdkResolvedConfig;
 } | {
     readonly kind: 'copilot-cli';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: CopilotCliResolvedConfig;
 } | {
     readonly kind: 'pi-coding-agent';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: PiCodingAgentResolvedConfig;
 } | {
     readonly kind: 'pi-agent-sdk';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: PiAgentSdkResolvedConfig;
 } | {
     readonly kind: 'claude';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
+    readonly workers?: number;
+    readonly providerBatching?: boolean;
+    readonly config: ClaudeResolvedConfig;
+} | {
+    readonly kind: 'claude-cli';
+    readonly name: string;
+    readonly graderTarget?: string;
+    readonly workers?: number;
+    readonly providerBatching?: boolean;
+    readonly config: ClaudeResolvedConfig;
+} | {
+    readonly kind: 'claude-sdk';
+    readonly name: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: ClaudeResolvedConfig;
 } | {
     readonly kind: 'mock';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: MockResolvedConfig;
 } | {
     readonly kind: 'vscode' | 'vscode-insiders';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: VSCodeResolvedConfig;
+} | {
+    readonly kind: 'agentv';
+    readonly name: string;
+    readonly graderTarget?: string;
+    readonly workers?: number;
+    readonly providerBatching?: boolean;
+    readonly config: AgentVResolvedConfig;
 } | {
     readonly kind: 'cli';
     readonly name: string;
-    readonly judgeTarget?: string;
+    readonly graderTarget?: string;
     readonly workers?: number;
     readonly providerBatching?: boolean;
     readonly config: CliResolvedConfig;
@@ -1845,7 +1965,7 @@ declare function resolveAndCreateProvider(definition: TargetDefinition, env?: En
 /**
  * Function to resolve a target name to a provider.
- * Used by code judges to support target override.
+ * Used by code graders to support target override.
  */
 type TargetResolver = (targetName: string) => Provider | undefined;
 interface EvaluationContext {
@@ -1861,6 +1981,8 @@ interface EvaluationContext {
         readonly chatPrompt?: ChatPrompt;
     };
     readonly now: Date;
+    readonly graderProvider?: Provider;
+    /** @deprecated Use `graderProvider` instead */
     readonly judgeProvider?: Provider;
     readonly evaluatorTemplateOverride?: string;
     readonly evaluator?: EvaluatorConfig;
@@ -1878,9 +2000,9 @@ interface EvaluationContext {
     readonly startTime?: string;
     /** ISO 8601 timestamp when execution ended */
     readonly endTime?: string;
-    /** Resolver for target override in code judges */
+    /** Resolver for target override in code graders */
     readonly targetResolver?: TargetResolver;
-    /** List of available target names for code judges */
+    /** List of available target names for code graders */
     readonly availableTargets?: readonly string[];
     /** Unified diff of file changes from workspace (when workspace_template is configured) */
     readonly fileChanges?: string;
@@ -1896,7 +2018,7 @@ interface EvaluationScore {
     readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
     readonly scores?: readonly ChildEvaluatorResult[];
-    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
+    /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
     readonly details?: JsonObject;
     /** Token usage from LLM calls made by this evaluator (optional). */
     readonly tokenUsage?: TokenUsage;
@@ -1912,7 +2034,7 @@ interface ChildEvaluatorResult {
     readonly reasoning?: string;
     readonly evaluatorRawRequest?: JsonObject;
     readonly scores?: readonly ChildEvaluatorResult[];
-    /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
+    /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
     readonly details?: JsonObject;
     /** Token usage from LLM calls made by this evaluator (optional). */
     readonly tokenUsage?: TokenUsage;
@@ -2071,12 +2193,18 @@ declare class LatencyEvaluator implements Evaluator {
  * Custom evaluators can override this via evaluatorTemplate option.
  */
 declare const DEFAULT_EVALUATOR_TEMPLATE: string;
-type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
-interface LlmJudgeEvaluatorOptions {
-    readonly resolveJudgeProvider: JudgeProviderResolver;
+type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
+interface LlmGraderEvaluatorOptions {
+    readonly resolveGraderProvider: GraderProviderResolver;
+    /** @deprecated Use `resolveGraderProvider` instead. */
+    readonly resolveJudgeProvider?: GraderProviderResolver;
     readonly maxOutputTokens?: number;
     readonly temperature?: number;
     readonly evaluatorTemplate?: string;
+    readonly maxSteps?: number;
+    readonly graderTargetProvider?: Provider;
+    /** @deprecated Use `graderTargetProvider` instead. */
+    readonly judgeTargetProvider?: Provider;
 }
 declare const freeformEvaluationSchema: z.ZodObject<{
     score: z.ZodNumber;
@@ -2125,13 +2253,15 @@ declare const rubricEvaluationSchema: z.ZodObject<{
     overall_reasoning: string;
 }>;
-declare class LlmJudgeEvaluator implements Evaluator {
-    readonly kind = "llm-judge";
-    private readonly resolveJudgeProvider;
+declare class LlmGraderEvaluator implements Evaluator {
+    readonly kind = "llm-grader";
+    private readonly resolveGraderProvider;
     private readonly maxOutputTokens?;
     private readonly temperature?;
     private readonly evaluatorTemplate?;
-    constructor(options: LlmJudgeEvaluatorOptions);
+    private readonly maxSteps;
+    private readonly graderTargetProvider?;
+    constructor(options: LlmGraderEvaluatorOptions);
     evaluate(context: EvaluationContext): Promise<EvaluationScore>;
     private evaluateFreeform;
     private evaluateWithRubrics;
@@ -2140,6 +2270,43 @@ declare class LlmJudgeEvaluator implements Evaluator {
      * Each criterion is scored 0-10 and normalized to 0-1.
      */
     private evaluateWithScoreRanges;
+    /**
+     * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
+     */
+    private evaluateBuiltIn;
+    /**
+     * Grader target mode: Delegates to an explicit graderTargetProvider via Provider.invoke().
+     */
+    private evaluateWithGraderTarget;
+    /**
+     * Delegate mode: resolved provider is an agent provider — send prompt via invoke().
+     */
+    private evaluateWithDelegatedAgent;
+    /**
+     * Shared implementation for grader_target and delegate modes.
+     * Both invoke a provider and parse the agent result from the response.
+     */
+    private evaluateWithDelegate;
+    /**
+     * Build system prompt for built-in agent mode.
+     * Includes output format instructions.
+     */
+    private buildAgentSystemPrompt;
+    /**
+     * Build user prompt for built-in agent mode.
+     * Uses custom template if provided, otherwise builds default prompt.
+     */
+    private buildAgentUserPrompt;
+    /**
+     * Build the full evaluation prompt for delegate mode (agent providers).
+     * Combines task context, criteria, candidate info, and output format instructions.
+     */
+    private buildDelegatedPrompt;
+    /**
+     * Parse the agent's response text into an EvaluationScore.
+     * Supports both freeform and rubric modes.
+     */
+    private parseAgentResult;
     /**
      * Build prompt for score-range rubric evaluation.
      */
@@ -2165,67 +2332,40 @@ declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSch
  */
 declare function buildScoreRangeOutputSchema(): string;
-interface AgentJudgeEvaluatorOptions {
-    readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise<Provider | undefined>;
-    readonly maxSteps?: number;
-    readonly temperature?: number;
-    readonly evaluatorTemplate?: string;
-    readonly judgeTargetProvider?: Provider;
-}
-declare class AgentJudgeEvaluator implements Evaluator {
-    readonly kind = "agent-judge";
-    private readonly resolveJudgeProvider;
-    private readonly maxSteps;
-    private readonly temperature;
-    private readonly evaluatorTemplate?;
-    private readonly judgeTargetProvider?;
-    constructor(options: AgentJudgeEvaluatorOptions);
-    evaluate(context: EvaluationContext): Promise<EvaluationScore>;
-    /**
-     * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
-     */
-    private evaluateBuiltIn;
-    /**
-     * Judge target mode: Delegates to an external agent provider via Provider.invoke().
-     */
-    private evaluateWithJudgeTarget;
-    /**
-     * Parse the agent's response text into an EvaluationScore.
-     * Supports both freeform and rubric modes.
-     */
-    private parseResult;
-    /**
-     * Build system prompt for built-in mode.
-     * Includes output format instructions.
-     */
-    private buildSystemPrompt;
-    /**
-     * Build user prompt for built-in mode.
-     * Uses custom template if provided, otherwise builds default prompt.
-     */
-    private buildUserPrompt;
-    /**
-     * Build the full evaluation prompt for judge target mode (delegation).
-     * Combines task context, criteria, candidate info, and output format instructions.
-     */
-    private buildDelegatedPrompt;
+/**
+ * Built-in skill-trigger evaluator.
+ *
+ * Detects whether the agent invoked a named Claude Code skill as its first tool call.
+ * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py:
+ *   - Only the FIRST tool call matters.
+ *   - Skill tool: checks input.skill contains the skill name (case-sensitive substring).
+ *   - Read tool: checks input.file_path contains the skill name (case-sensitive substring).
+ *   - Any other tool as first call means the skill was not triggered.
+ *   - Supports negative cases via should_trigger: false.
+ */
+declare class SkillTriggerEvaluator implements Evaluator {
+    readonly kind = "skill-trigger";
+    private readonly config;
+    constructor(config: SkillTriggerEvaluatorConfig);
+    evaluate(context: EvaluationContext): EvaluationScore;
 }
-interface LlmJudgePromptAssembly {
+interface LlmGraderPromptAssembly {
     systemPrompt: string;
     userPrompt: string;
     responseSchema: string;
     mode: 'freeform' | 'checklist' | 'score_range';
 }
-declare function assembleLlmJudgePrompt(input: {
+declare function assembleLlmGraderPrompt(input: {
     evalCase: EvalTest;
     candidate: string;
     promptInputs: PromptInputs;
-    evaluatorConfig?: LlmJudgeEvaluatorConfig;
+    evaluatorConfig?: LlmGraderEvaluatorConfig;
     output?: readonly Message[];
     fileChanges?: string;
     evaluatorTemplateOverride?: string;
-}): LlmJudgePromptAssembly;
+}): LlmGraderPromptAssembly;
 interface TokenUsageEvaluatorOptions {
     readonly config: TokenUsageEvaluatorConfig;
@@ -2322,18 +2462,22 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
  * Contains shared resources needed by evaluator instances.
  */
 interface EvaluatorDispatchContext {
-    /** Shared LLM judge provider (resolved at suite level) */
+    /** Shared LLM grader provider (resolved at suite level) */
+    readonly graderProvider?: Provider;
+    /** @deprecated Use `graderProvider` instead; this is the pre-rename member and is still declared on the type */
     readonly judgeProvider?: Provider;
     /** Function to resolve target names to providers */
     readonly targetResolver?: TargetResolver;
-    /** Available target names for code judges */
+    /** Available target names for code graders */
     readonly availableTargets?: readonly string[];
     /** Agent timeout in ms */
     readonly agentTimeoutMs?: number;
     /** Directory containing the eval file (for composite member resolution) */
     readonly evalFileDir?: string;
-    /** Shared LLM judge evaluator instance */
-    readonly llmJudge: Evaluator;
+    /** Shared LLM grader evaluator instance (required member) */
+    readonly llmGrader: Evaluator;
+    /** @deprecated Use `llmGrader` instead. Now optional; it was a required member before the rename, so readers must handle `undefined`. */
+    readonly llmJudge?: Evaluator;
     /** Reference to the registry itself (for composite evaluators that need to create children) */
     readonly registry: EvaluatorRegistry;
 }
@@ -2341,8 +2485,8 @@ interface EvaluatorDispatchContext {
  * Factory function that creates an Evaluator instance from a config.
  *
  * Factory functions handle all type-specific initialization logic:
- * - Reading prompt files for LLM judges
- * - Resolving script paths for code judges
+ * - Reading prompt files for LLM graders
+ * - Resolving script paths for code graders
  * - Creating adapter evaluators for deterministic assertions
  */
 type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>; // a factory may return the evaluator directly or as a Promise
@@ -2404,7 +2548,7 @@ interface RunEvalCaseOptions {
     readonly provider: Provider;
     readonly target: ResolvedTarget;
     readonly evaluators: Partial<Record<string, Evaluator>> & {
-        readonly 'llm-judge': Evaluator;
+        readonly 'llm-grader': Evaluator;
     };
     readonly now?: () => Date;
     readonly maxRetries?: number;
@@ -2412,10 +2556,10 @@ interface RunEvalCaseOptions {
     readonly cache?: EvaluationCache;
     readonly useCache?: boolean;
     readonly signal?: AbortSignal;
-    readonly judgeProvider?: Provider;
-    /** Resolver for target override in code judges */
+    readonly graderProvider?: Provider;
+    /** Resolver for target override in code graders */
     readonly targetResolver?: (name: string) => Provider | undefined;
-    /** List of available target names for code judges */
+    /** List of available target names for code graders */
     readonly availableTargets?: readonly string[];
     /** Unique identifier for the evaluation run (used for workspace management) */
     readonly evalRunId?: string;
@@ -2498,10 +2642,44 @@ interface RunEvaluationOptions {
     readonly retainOnSuccess?: 'keep' | 'cleanup';
     /** Retention policy override for failed cases */
     readonly retainOnFailure?: 'keep' | 'cleanup';
+    /** CLI override: grader target name (e.g., "agentv" or a target from targets.yaml) */
+    readonly graderTarget?: string;
+    /** CLI override: model for grader target (e.g., "openai:gpt-5-mini") */
+    readonly model?: string;
 }
 declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>; // async; resolves to a read-only array of EvaluationResult
 declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>; // async; resolves to a single EvaluationResult
+/**
+ * Types for inline assertion functions used in the evaluate() API.
+ *
+ * Inline functions are the escape hatch for custom evaluation logic
+ * that doesn't fit a built-in evaluator type. For built-in assertions
+ * (contains, regex, is-json, etc.), use config objects instead:
+ *
+ *   assert: [{ type: 'contains', value: 'hello' }]
+ *
+ * Inline functions are for custom logic:
+ *
+ *   assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
+ */
+/**
+ * Context passed to inline assertion functions (the single argument of `AssertFn`).
+ *
+ * `input` and `output` are always present. `expectedOutput`, `criteria` and `metadata`
+ * are optional, so an assertion should handle them being `undefined`.
+ */
+interface AssertContext {
+    readonly input: string;
+    readonly output: string;
+    readonly expectedOutput?: string;
+    readonly criteria?: string;
+    readonly metadata?: Record<string, unknown>;
+}
+/**
+ * Result from an inline assertion function.
+ *
+ * `name` and `score` are required; `metadata` is optional. The documented examples
+ * return a score of 1 or 0. NOTE(review): presumably scores are expected to lie in
+ * [0, 1]; confirm against the evaluator implementation.
+ */
+interface AssertResult {
+    readonly name: string;
+    readonly score: number;
+    readonly metadata?: Record<string, unknown>;
+}
+/**
+ * Inline assertion function signature.
+ *
+ * May be synchronous or asynchronous: it returns either an `AssertResult` or a
+ * Promise of one.
+ */
+type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
 /**
  * Programmatic API for running evaluations.
  *
@@ -2509,7 +2687,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
  * instead of a CLI. The config shape mirrors the YAML structure for easy
  * translation between file-based and programmatic usage.
  *
- * @example Inline tests
+ * @example Inline tests with config objects
  * ```typescript
  * import { evaluate } from '@agentv/core';
  *
@@ -2518,7 +2696,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
  *     {
  *       id: 'capital',
  *       input: 'What is the capital of France?',
- *       expected_output: 'Paris',
+ *       expectedOutput: 'Paris',
  *       assert: [{ type: 'contains', value: 'Paris' }],
  *     },
  *   ],
@@ -2528,6 +2706,27 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
  * console.log(results.summary.passed, 'passed');
  * ```
  *
+ * @example Inline tests with task function and custom assertion
+ * ```typescript
+ * import { evaluate } from '@agentv/core';
+ *
+ * const { summary } = await evaluate({
+ *   tests: [
+ *     {
+ *       id: 'echo',
+ *       input: 'hello',
+ *       expectedOutput: 'Echo: hello',
+ *       assert: [
+ *         { type: 'contains', value: 'hello' },
+ *         { type: 'equals' },
+ *         ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
+ *       ],
+ *     },
+ *   ],
+ *   task: async (input) => `Echo: ${input}`,
+ * });
+ * ```
+ *
  * @example File-based
  * ```typescript
  * const results = await evaluate({
@@ -2553,10 +2752,12 @@ interface EvalTestInput {
         role: string;
         content: string;
     }[];
-    /** Expected reference output */
+    /** Expected reference output (camelCase preferred) */
+    readonly expectedOutput?: string;
+    /** @deprecated Use `expectedOutput` instead */
     readonly expected_output?: string;
-    /** Assertion evaluators */
-    readonly assert?: readonly EvalAssertionInput[];
+    /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
+    readonly assert?: readonly AssertEntry[];
     /** Arbitrary metadata */
     readonly metadata?: Record<string, unknown>;
 }
@@ -2592,6 +2793,8 @@ interface EvalAssertionInput {
     /** Additional properties */
     readonly [key: string]: unknown;
 }
+/**
+ * Assert entry: inline function or config object.
+ *
+ * Element type of both `EvalTestInput.assert` and `EvalConfig.assert`, so the two
+ * forms can be mixed within one array.
+ */
+type AssertEntry = AssertFn | EvalAssertionInput;
 /**
  * Configuration for `evaluate()`.
  * Accepts either inline tests or a spec file path.
@@ -2603,8 +2806,10 @@ interface EvalConfig {
     readonly specFile?: string;
     /** Target provider configuration */
     readonly target?: TargetDefinition;
+    /** Custom task function — mutually exclusive with target */
+    readonly task?: (input: string) => string | Promise<string>;
     /** Suite-level assertions applied to all tests */
-    readonly assert?: readonly EvalAssertionInput[];
+    readonly assert?: readonly AssertEntry[];
     /** Filter tests by ID pattern (glob supported) */
     readonly filter?: string;
     /** Maximum concurrent workers (default: 3) */
@@ -3298,9 +3503,29 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
  */
 declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
+/**
+ * Convention-based discovery of custom grader scripts.
+ *
+ * Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
+ * files and registers them as code-grader evaluators in the registry. The file name
+ * (without extension) becomes the evaluator type name.
+ *
+ * Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
+ */
+/**
+ * Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
+ * and register them as evaluator types in the registry.
+ *
+ * Asynchronous: the result is delivered through a Promise. The supplied registry
+ * receives the registrations. Also exported under the older name `discoverJudges`.
+ *
+ * @param registry - The evaluator registry to register discovered graders into
+ * @param baseDir - The base directory to search from (typically project root or eval file dir)
+ * @returns A promise resolving to the names of discovered grader types
+ */
+declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
 type AgentKernel = { // object shape with a single `status` string member
     status: string;
 };
 declare function createAgentKernel(): AgentKernel; // takes no arguments and returns an AgentKernel
-export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type 
GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type 
VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, 
runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, 
type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type 
ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, 
mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };