@agentv/core 2.19.0 → 3.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -8,7 +8,7 @@ interface ChatMessage {
8
8
  readonly name?: string;
9
9
  }
10
10
  type ChatPrompt = readonly ChatMessage[];
11
- type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
11
+ type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
12
12
  /** Callbacks for real-time observability during provider execution */
13
13
  interface ProviderStreamCallbacks {
14
14
  onToolCallStart?: (toolName: string, toolCallId?: string) => void;
@@ -144,6 +144,8 @@ type EnvLookup = Readonly<Record<string, string | undefined>>;
144
144
  interface TargetDefinition {
145
145
  readonly name: string;
146
146
  readonly provider: ProviderKind | string;
147
+ readonly grader_target?: string | undefined;
148
+ /** @deprecated Use `grader_target` instead */
147
149
  readonly judge_target?: string | undefined;
148
150
  readonly workers?: number | undefined;
149
151
  readonly provider_batching?: boolean | undefined;
@@ -463,11 +465,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
463
465
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
464
466
  */
465
467
  declare function isTestMessage(value: unknown): value is TestMessage;
466
- declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
468
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
467
469
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
468
470
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
469
471
  /**
470
- * Configuration for enabling target access in code-judge evaluators.
472
+ * Configuration for enabling target access in code-grader evaluators.
471
473
  * When present, the runtime will start a local proxy server that allows
472
474
  * the script to invoke configured targets without direct credential access.
473
475
  */
@@ -566,7 +568,7 @@ type WorkspaceConfig = {
566
568
  };
567
569
  type CodeEvaluatorConfig = {
568
570
  readonly name: string;
569
- readonly type: 'code-judge';
571
+ readonly type: 'code-judge' | 'code-grader';
570
572
  readonly command: readonly string[];
571
573
  /** @deprecated Use `command` instead */
572
574
  readonly script?: readonly string[];
@@ -577,14 +579,14 @@ type CodeEvaluatorConfig = {
577
579
  readonly required?: boolean | number;
578
580
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
579
581
  readonly negate?: boolean;
580
- /** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
582
+ /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
581
583
  readonly config?: JsonObject;
582
584
  /** When present, enables target access via local proxy */
583
585
  readonly target?: TargetAccessConfig;
584
586
  };
585
587
  /**
586
588
  * Executable prompt template configuration.
587
- * Matches code-judge pattern for consistency.
589
+ * Matches code-grader pattern for consistency.
588
590
  */
589
591
  type PromptScriptConfig = {
590
592
  /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
@@ -594,24 +596,32 @@ type PromptScriptConfig = {
594
596
  /** Pass-through configuration for the prompt template */
595
597
  readonly config?: Record<string, unknown>;
596
598
  };
597
- type LlmJudgeEvaluatorConfig = {
599
+ type LlmGraderEvaluatorConfig = {
598
600
  readonly name: string;
599
- readonly type: 'llm-judge';
601
+ readonly type: 'llm-grader' | 'llm-judge';
600
602
  /** Text prompt (inline or file path) or executable script config */
601
603
  readonly prompt?: string | PromptScriptConfig;
602
604
  readonly promptPath?: string;
603
605
  /** Resolved absolute path for prompt file (used for text template prompts) */
604
606
  readonly resolvedPromptPath?: string;
605
- /** Resolved script array for executable prompts (matches code-judge pattern) */
607
+ /** Resolved script array for executable prompts (matches code-grader pattern) */
606
608
  readonly resolvedPromptScript?: readonly string[];
607
609
  readonly rubrics?: readonly RubricItem[];
608
610
  readonly weight?: number;
609
611
  readonly required?: boolean | number;
610
612
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
611
613
  readonly negate?: boolean;
614
+ /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
615
+ readonly target?: string;
612
616
  /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
613
617
  readonly config?: Record<string, unknown>;
618
+ /** Maximum agent steps for agentv built-in mode (default 10, max 50). Ignored in LLM mode. */
619
+ readonly max_steps?: number;
620
+ /** Temperature override for grader calls */
621
+ readonly temperature?: number;
614
622
  };
623
+ /** @deprecated Use `LlmGraderEvaluatorConfig` instead */
624
+ type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
615
625
  /**
616
626
  * Score range definition for analytic rubric scoring.
617
627
  * Each range maps an integer score band (0-10) to an outcome description.
@@ -623,7 +633,7 @@ type ScoreRange = {
623
633
  readonly outcome: string;
624
634
  };
625
635
  /**
626
- * Rubric item for LLM judge evaluation.
636
+ * Rubric item for LLM grader evaluation.
627
637
  * Supports two modes:
628
638
  * - Checklist mode: boolean satisfied/not-satisfied with `outcome`
629
639
  * - Score-range mode: 0-10 integer scoring with `score_ranges`
@@ -648,7 +658,7 @@ type RubricItem = {
648
658
  readonly required_min_score?: number;
649
659
  /**
650
660
  * Score range definitions for analytic rubric scoring.
651
- * When present, the judge outputs an integer 0-10 score per criterion.
661
+ * When present, the grader outputs an integer 0-10 score per criterion.
652
662
  * Ranges must be non-overlapping and cover 0-10 inclusive.
653
663
  */
654
664
  readonly score_ranges?: readonly ScoreRange[];
@@ -656,10 +666,19 @@ type RubricItem = {
656
666
  type CompositeAggregatorConfig = {
657
667
  readonly type: 'weighted_average';
658
668
  readonly weights?: Record<string, number>;
669
+ } | {
670
+ readonly type: 'code-grader';
671
+ readonly path: string;
672
+ readonly cwd?: string;
659
673
  } | {
660
674
  readonly type: 'code-judge';
661
675
  readonly path: string;
662
676
  readonly cwd?: string;
677
+ } | {
678
+ readonly type: 'llm-grader';
679
+ readonly prompt?: string;
680
+ readonly promptPath?: string;
681
+ readonly model?: string;
663
682
  } | {
664
683
  readonly type: 'llm-judge';
665
684
  readonly prompt?: string;
@@ -672,7 +691,7 @@ type CompositeAggregatorConfig = {
672
691
  type CompositeEvaluatorConfig = {
673
692
  readonly name: string;
674
693
  readonly type: 'composite';
675
- readonly evaluators: readonly EvaluatorConfig[];
694
+ readonly assertions: readonly EvaluatorConfig[];
676
695
  readonly aggregator: CompositeAggregatorConfig;
677
696
  readonly weight?: number;
678
697
  readonly required?: boolean | number;
@@ -681,7 +700,7 @@ type CompositeEvaluatorConfig = {
681
700
  };
682
701
  /**
683
702
  * Match type for field accuracy evaluation.
684
- * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
703
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-grader evaluator.
685
704
  * See examples/features/document-extraction/fuzzy_match.ts for an example.
686
705
  */
687
706
  type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
@@ -796,34 +815,6 @@ type ExecutionMetricsEvaluatorConfig = {
796
815
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
797
816
  readonly negate?: boolean;
798
817
  };
799
- /**
800
- * Configuration for the agent-judge evaluator.
801
- * Runs an agentic investigation loop to audit workspaces and verify criteria.
802
- * Two modes:
803
- * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
804
- * - Judge target: Delegates to an external agent provider via Provider.invoke()
805
- */
806
- type AgentJudgeEvaluatorConfig = {
807
- readonly name: string;
808
- readonly type: 'agent-judge';
809
- /** Custom evaluation prompt (inline text or file path) */
810
- readonly prompt?: string;
811
- readonly promptPath?: string;
812
- /** Resolved absolute path for prompt file */
813
- readonly resolvedPromptPath?: string;
814
- /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
815
- readonly rubrics?: readonly RubricItem[];
816
- /** Maximum agent steps for built-in mode (default 10, max 50) */
817
- readonly max_steps?: number;
818
- /** Temperature for built-in mode (default 0) */
819
- readonly temperature?: number;
820
- /** Target name — delegates agent loop to this provider instead of built-in mode */
821
- readonly target?: string;
822
- readonly weight?: number;
823
- readonly required?: boolean | number;
824
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
825
- readonly negate?: boolean;
826
- };
827
818
  /**
828
819
  * Configuration for the contains assertion evaluator.
829
820
  * Checks whether the candidate output contains a specified substring.
@@ -981,7 +972,34 @@ type RubricsEvaluatorConfig = {
981
972
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
982
973
  readonly negate?: boolean;
983
974
  };
984
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
975
+ /**
976
+ * Configuration for the skill-trigger evaluator.
977
+ * Detects whether the agent invoked a named Claude Code skill as its first tool call.
978
+ * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py.
979
+ */
980
+ type SkillTriggerEvaluatorConfig = {
981
+ readonly name: string;
982
+ readonly type: 'skill-trigger';
983
+ /** The skill name to check for (case-sensitive substring match) */
984
+ readonly skill: string;
985
+ /** Whether the skill is expected to trigger (default: true) */
986
+ readonly should_trigger?: boolean;
987
+ readonly weight?: number;
988
+ readonly required?: boolean | number;
989
+ readonly negate?: boolean;
990
+ };
991
+ /**
992
+ * Configuration for the inline-assert evaluator.
993
+ * Wraps an AssertFn for in-process evaluation via the evaluate() API.
994
+ */
995
+ type InlineAssertEvaluatorConfig = {
996
+ readonly name: string;
997
+ readonly type: 'inline-assert';
998
+ readonly weight?: number;
999
+ readonly required?: boolean | number;
1000
+ readonly negate?: boolean;
1001
+ };
1002
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
985
1003
  /**
986
1004
  * Eval test definition sourced from AgentV specs.
987
1005
  */
@@ -999,7 +1017,7 @@ interface EvalTest {
999
1017
  readonly file_paths: readonly string[];
1000
1018
  readonly criteria: string;
1001
1019
  readonly evaluator?: EvaluatorKind;
1002
- readonly evaluators?: readonly EvaluatorConfig[];
1020
+ readonly assertions?: readonly EvaluatorConfig[];
1003
1021
  /** Workspace configuration (merged from suite-level and case-level) */
1004
1022
  readonly workspace?: WorkspaceConfig;
1005
1023
  /** Arbitrary metadata passed to workspace scripts via stdin */
@@ -1172,15 +1190,15 @@ interface EvaluatorResult {
1172
1190
  readonly rawRequest?: JsonObject;
1173
1191
  readonly evaluatorProviderRequest?: JsonObject;
1174
1192
  readonly scores?: readonly EvaluatorResult[];
1175
- /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
1193
+ /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
1176
1194
  readonly details?: JsonObject;
1177
1195
  /** Token usage from LLM calls made by this evaluator (optional). */
1178
1196
  readonly tokenUsage?: TokenUsage;
1179
- /** Wall-clock duration of this judge execution in milliseconds. */
1197
+ /** Wall-clock duration of this grader execution in milliseconds. */
1180
1198
  readonly durationMs?: number;
1181
- /** ISO 8601 UTC timestamp when this judge started executing. */
1199
+ /** ISO 8601 UTC timestamp when this grader started executing. */
1182
1200
  readonly startedAt?: string;
1183
- /** ISO 8601 UTC timestamp when this judge finished executing. */
1201
+ /** ISO 8601 UTC timestamp when this grader finished executing. */
1184
1202
  readonly endedAt?: string;
1185
1203
  }
1186
1204
  /**
@@ -1314,7 +1332,7 @@ declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): P
1314
1332
  /**
1315
1333
  * Detect file format by extension.
1316
1334
  */
1317
- declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
1335
+ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json';
1318
1336
 
1319
1337
  type LoadOptions = {
1320
1338
  readonly verbose?: boolean;
@@ -1366,6 +1384,83 @@ declare function loadTestById(evalFilePath: string, repoRoot: URL | string, eval
1366
1384
  /** @deprecated Use `loadTestById` instead */
1367
1385
  declare const loadEvalCaseById: typeof loadTestById;
1368
1386
 
1387
+ /**
1388
+ * Raw Agent Skills evals.json schema.
1389
+ * @see https://agentskills.io/skill-creation/evaluating-skills
1390
+ */
1391
+ interface AgentSkillsEvalsFile {
1392
+ readonly skill_name?: string;
1393
+ readonly evals: readonly AgentSkillsEvalCase[];
1394
+ }
1395
+ interface AgentSkillsEvalCase {
1396
+ readonly id: number;
1397
+ readonly prompt: string;
1398
+ readonly expected_output?: string;
1399
+ readonly files?: readonly string[];
1400
+ readonly assertions?: readonly string[];
1401
+ }
1402
+ /**
1403
+ * Detect whether a JSON file is in Agent Skills evals.json format.
1404
+ * Returns true if the parsed content has an `evals` array.
1405
+ */
1406
+ declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEvalsFile;
1407
+ /**
1408
+ * Parse already-loaded Agent Skills evals data into EvalTest[].
1409
+ * Exported for testing without file I/O.
1410
+ */
1411
+ declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
1412
+
1413
+ /**
1414
+ * EVAL.yaml → evals.json transpiler.
1415
+ *
1416
+ * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
1417
+ * for consumption by the skill-creator pipeline.
1418
+ *
1419
+ * Handles both `assertions:` (current) and `assert:` (deprecated alias).
1420
+ */
1421
+ interface EvalsJsonCase {
1422
+ id: number;
1423
+ prompt: string;
1424
+ expected_output?: string;
1425
+ files?: string[];
1426
+ should_trigger?: boolean;
1427
+ assertions: string[];
1428
+ }
1429
+ interface EvalsJsonFile {
1430
+ skill_name: string;
1431
+ evals: EvalsJsonCase[];
1432
+ }
1433
+ /**
1434
+ * Result of transpiling a single EVAL.yaml.
1435
+ * May produce multiple evals.json files (one per skill).
1436
+ */
1437
+ interface TranspileResult {
1438
+ /** Map from skill_name → EvalsJsonFile */
1439
+ files: Map<string, EvalsJsonFile>;
1440
+ /** Warning messages accumulated during transpilation */
1441
+ warnings: string[];
1442
+ }
1443
+ /**
1444
+ * Transpile a parsed EVAL.yaml object into one or more evals.json objects.
1445
+ *
1446
+ * @param suite Parsed YAML object (already loaded, no file I/O here)
1447
+ * @param source Source identifier for error messages (e.g. file path)
1448
+ */
1449
+ declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
1450
+ /**
1451
+ * Transpile an EVAL.yaml file into one or more evals.json objects.
1452
+ * Returns a map from output filename → JSON content.
1453
+ *
1454
+ * @param evalYamlPath Absolute path to the EVAL.yaml file
1455
+ */
1456
+ declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
1457
+ /**
1458
+ * Determine the output filename(s) for a transpile result.
1459
+ * Single skill → "evals.json"
1460
+ * Multiple skills → "<skill>.evals.json"
1461
+ */
1462
+ declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
1463
+
1369
1464
  declare function fileExists(filePath: string): Promise<boolean>;
1370
1465
  /**
1371
1466
  * Normalize line endings to LF (\n).
@@ -1613,87 +1708,112 @@ interface VSCodeResolvedConfig {
1613
1708
  readonly workspaceTemplate?: string;
1614
1709
  readonly timeoutMs?: number;
1615
1710
  }
1711
+ interface AgentVResolvedConfig {
1712
+ readonly model: string;
1713
+ readonly temperature: number;
1714
+ }
1616
1715
  type ResolvedTarget = {
1617
1716
  readonly kind: 'azure';
1618
1717
  readonly name: string;
1619
- readonly judgeTarget?: string;
1718
+ readonly graderTarget?: string;
1620
1719
  readonly workers?: number;
1621
1720
  readonly providerBatching?: boolean;
1622
1721
  readonly config: AzureResolvedConfig;
1623
1722
  } | {
1624
1723
  readonly kind: 'anthropic';
1625
1724
  readonly name: string;
1626
- readonly judgeTarget?: string;
1725
+ readonly graderTarget?: string;
1627
1726
  readonly workers?: number;
1628
1727
  readonly providerBatching?: boolean;
1629
1728
  readonly config: AnthropicResolvedConfig;
1630
1729
  } | {
1631
1730
  readonly kind: 'gemini';
1632
1731
  readonly name: string;
1633
- readonly judgeTarget?: string;
1732
+ readonly graderTarget?: string;
1634
1733
  readonly workers?: number;
1635
1734
  readonly providerBatching?: boolean;
1636
1735
  readonly config: GeminiResolvedConfig;
1637
1736
  } | {
1638
1737
  readonly kind: 'codex';
1639
1738
  readonly name: string;
1640
- readonly judgeTarget?: string;
1739
+ readonly graderTarget?: string;
1641
1740
  readonly workers?: number;
1642
1741
  readonly providerBatching?: boolean;
1643
1742
  readonly config: CodexResolvedConfig;
1644
1743
  } | {
1645
1744
  readonly kind: 'copilot-sdk';
1646
1745
  readonly name: string;
1647
- readonly judgeTarget?: string;
1746
+ readonly graderTarget?: string;
1648
1747
  readonly workers?: number;
1649
1748
  readonly providerBatching?: boolean;
1650
1749
  readonly config: CopilotSdkResolvedConfig;
1651
1750
  } | {
1652
1751
  readonly kind: 'copilot-cli';
1653
1752
  readonly name: string;
1654
- readonly judgeTarget?: string;
1753
+ readonly graderTarget?: string;
1655
1754
  readonly workers?: number;
1656
1755
  readonly providerBatching?: boolean;
1657
1756
  readonly config: CopilotCliResolvedConfig;
1658
1757
  } | {
1659
1758
  readonly kind: 'pi-coding-agent';
1660
1759
  readonly name: string;
1661
- readonly judgeTarget?: string;
1760
+ readonly graderTarget?: string;
1662
1761
  readonly workers?: number;
1663
1762
  readonly providerBatching?: boolean;
1664
1763
  readonly config: PiCodingAgentResolvedConfig;
1665
1764
  } | {
1666
1765
  readonly kind: 'pi-agent-sdk';
1667
1766
  readonly name: string;
1668
- readonly judgeTarget?: string;
1767
+ readonly graderTarget?: string;
1669
1768
  readonly workers?: number;
1670
1769
  readonly providerBatching?: boolean;
1671
1770
  readonly config: PiAgentSdkResolvedConfig;
1672
1771
  } | {
1673
1772
  readonly kind: 'claude';
1674
1773
  readonly name: string;
1675
- readonly judgeTarget?: string;
1774
+ readonly graderTarget?: string;
1775
+ readonly workers?: number;
1776
+ readonly providerBatching?: boolean;
1777
+ readonly config: ClaudeResolvedConfig;
1778
+ } | {
1779
+ readonly kind: 'claude-cli';
1780
+ readonly name: string;
1781
+ readonly graderTarget?: string;
1782
+ readonly workers?: number;
1783
+ readonly providerBatching?: boolean;
1784
+ readonly config: ClaudeResolvedConfig;
1785
+ } | {
1786
+ readonly kind: 'claude-sdk';
1787
+ readonly name: string;
1788
+ readonly graderTarget?: string;
1676
1789
  readonly workers?: number;
1677
1790
  readonly providerBatching?: boolean;
1678
1791
  readonly config: ClaudeResolvedConfig;
1679
1792
  } | {
1680
1793
  readonly kind: 'mock';
1681
1794
  readonly name: string;
1682
- readonly judgeTarget?: string;
1795
+ readonly graderTarget?: string;
1683
1796
  readonly workers?: number;
1684
1797
  readonly providerBatching?: boolean;
1685
1798
  readonly config: MockResolvedConfig;
1686
1799
  } | {
1687
1800
  readonly kind: 'vscode' | 'vscode-insiders';
1688
1801
  readonly name: string;
1689
- readonly judgeTarget?: string;
1802
+ readonly graderTarget?: string;
1690
1803
  readonly workers?: number;
1691
1804
  readonly providerBatching?: boolean;
1692
1805
  readonly config: VSCodeResolvedConfig;
1806
+ } | {
1807
+ readonly kind: 'agentv';
1808
+ readonly name: string;
1809
+ readonly graderTarget?: string;
1810
+ readonly workers?: number;
1811
+ readonly providerBatching?: boolean;
1812
+ readonly config: AgentVResolvedConfig;
1693
1813
  } | {
1694
1814
  readonly kind: 'cli';
1695
1815
  readonly name: string;
1696
- readonly judgeTarget?: string;
1816
+ readonly graderTarget?: string;
1697
1817
  readonly workers?: number;
1698
1818
  readonly providerBatching?: boolean;
1699
1819
  readonly config: CliResolvedConfig;
@@ -1845,7 +1965,7 @@ declare function resolveAndCreateProvider(definition: TargetDefinition, env?: En
1845
1965
 
1846
1966
  /**
1847
1967
  * Function to resolve a target name to a provider.
1848
- * Used by code judges to support target override.
1968
+ * Used by code graders to support target override.
1849
1969
  */
1850
1970
  type TargetResolver = (targetName: string) => Provider | undefined;
1851
1971
  interface EvaluationContext {
@@ -1861,6 +1981,8 @@ interface EvaluationContext {
1861
1981
  readonly chatPrompt?: ChatPrompt;
1862
1982
  };
1863
1983
  readonly now: Date;
1984
+ readonly graderProvider?: Provider;
1985
+ /** @deprecated Use `graderProvider` instead */
1864
1986
  readonly judgeProvider?: Provider;
1865
1987
  readonly evaluatorTemplateOverride?: string;
1866
1988
  readonly evaluator?: EvaluatorConfig;
@@ -1878,9 +2000,9 @@ interface EvaluationContext {
1878
2000
  readonly startTime?: string;
1879
2001
  /** ISO 8601 timestamp when execution ended */
1880
2002
  readonly endTime?: string;
1881
- /** Resolver for target override in code judges */
2003
+ /** Resolver for target override in code graders */
1882
2004
  readonly targetResolver?: TargetResolver;
1883
- /** List of available target names for code judges */
2005
+ /** List of available target names for code graders */
1884
2006
  readonly availableTargets?: readonly string[];
1885
2007
  /** Unified diff of file changes from workspace (when workspace_template is configured) */
1886
2008
  readonly fileChanges?: string;
@@ -1896,7 +2018,7 @@ interface EvaluationScore {
1896
2018
  readonly reasoning?: string;
1897
2019
  readonly evaluatorRawRequest?: JsonObject;
1898
2020
  readonly scores?: readonly ChildEvaluatorResult[];
1899
- /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
2021
+ /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
1900
2022
  readonly details?: JsonObject;
1901
2023
  /** Token usage from LLM calls made by this evaluator (optional). */
1902
2024
  readonly tokenUsage?: TokenUsage;
@@ -1912,7 +2034,7 @@ interface ChildEvaluatorResult {
1912
2034
  readonly reasoning?: string;
1913
2035
  readonly evaluatorRawRequest?: JsonObject;
1914
2036
  readonly scores?: readonly ChildEvaluatorResult[];
1915
- /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
2037
+ /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
1916
2038
  readonly details?: JsonObject;
1917
2039
  /** Token usage from LLM calls made by this evaluator (optional). */
1918
2040
  readonly tokenUsage?: TokenUsage;
@@ -2071,12 +2193,18 @@ declare class LatencyEvaluator implements Evaluator {
2071
2193
  * Custom evaluators can override this via evaluatorTemplate option.
2072
2194
  */
2073
2195
  declare const DEFAULT_EVALUATOR_TEMPLATE: string;
2074
- type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
2075
- interface LlmJudgeEvaluatorOptions {
2076
- readonly resolveJudgeProvider: JudgeProviderResolver;
2196
+ type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
2197
+ interface LlmGraderEvaluatorOptions {
2198
+ readonly resolveGraderProvider: GraderProviderResolver;
2199
+ /** @deprecated Use `resolveGraderProvider` instead. */
2200
+ readonly resolveJudgeProvider?: GraderProviderResolver;
2077
2201
  readonly maxOutputTokens?: number;
2078
2202
  readonly temperature?: number;
2079
2203
  readonly evaluatorTemplate?: string;
2204
+ readonly maxSteps?: number;
2205
+ readonly graderTargetProvider?: Provider;
2206
+ /** @deprecated Use `graderTargetProvider` instead. */
2207
+ readonly judgeTargetProvider?: Provider;
2080
2208
  }
2081
2209
  declare const freeformEvaluationSchema: z.ZodObject<{
2082
2210
  score: z.ZodNumber;
@@ -2125,13 +2253,15 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2125
2253
  overall_reasoning: string;
2126
2254
  }>;
2127
2255
 
2128
- declare class LlmJudgeEvaluator implements Evaluator {
2129
- readonly kind = "llm-judge";
2130
- private readonly resolveJudgeProvider;
2256
+ declare class LlmGraderEvaluator implements Evaluator {
2257
+ readonly kind = "llm-grader";
2258
+ private readonly resolveGraderProvider;
2131
2259
  private readonly maxOutputTokens?;
2132
2260
  private readonly temperature?;
2133
2261
  private readonly evaluatorTemplate?;
2134
- constructor(options: LlmJudgeEvaluatorOptions);
2262
+ private readonly maxSteps;
2263
+ private readonly graderTargetProvider?;
2264
+ constructor(options: LlmGraderEvaluatorOptions);
2135
2265
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2136
2266
  private evaluateFreeform;
2137
2267
  private evaluateWithRubrics;
@@ -2140,6 +2270,43 @@ declare class LlmJudgeEvaluator implements Evaluator {
2140
2270
  * Each criterion is scored 0-10 and normalized to 0-1.
2141
2271
  */
2142
2272
  private evaluateWithScoreRanges;
2273
+ /**
2274
+ * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
2275
+ */
2276
+ private evaluateBuiltIn;
2277
+ /**
2278
+ * Grader target mode: Delegates to an explicit graderTargetProvider via Provider.invoke().
2279
+ */
2280
+ private evaluateWithGraderTarget;
2281
+ /**
2282
+ * Delegate mode: resolved provider is an agent provider — send prompt via invoke().
2283
+ */
2284
+ private evaluateWithDelegatedAgent;
2285
+ /**
2286
+ * Shared implementation for grader_target and delegate modes.
2287
+ * Both invoke a provider and parse the agent result from the response.
2288
+ */
2289
+ private evaluateWithDelegate;
2290
+ /**
2291
+ * Build system prompt for built-in agent mode.
2292
+ * Includes output format instructions.
2293
+ */
2294
+ private buildAgentSystemPrompt;
2295
+ /**
2296
+ * Build user prompt for built-in agent mode.
2297
+ * Uses custom template if provided, otherwise builds default prompt.
2298
+ */
2299
+ private buildAgentUserPrompt;
2300
+ /**
2301
+ * Build the full evaluation prompt for delegate mode (agent providers).
2302
+ * Combines task context, criteria, candidate info, and output format instructions.
2303
+ */
2304
+ private buildDelegatedPrompt;
2305
+ /**
2306
+ * Parse the agent's response text into an EvaluationScore.
2307
+ * Supports both freeform and rubric modes.
2308
+ */
2309
+ private parseAgentResult;
2143
2310
  /**
2144
2311
  * Build prompt for score-range rubric evaluation.
2145
2312
  */
@@ -2165,67 +2332,40 @@ declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSch
2165
2332
  */
2166
2333
  declare function buildScoreRangeOutputSchema(): string;
2167
2334
 
2168
- interface AgentJudgeEvaluatorOptions {
2169
- readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise<Provider | undefined>;
2170
- readonly maxSteps?: number;
2171
- readonly temperature?: number;
2172
- readonly evaluatorTemplate?: string;
2173
- readonly judgeTargetProvider?: Provider;
2174
- }
2175
- declare class AgentJudgeEvaluator implements Evaluator {
2176
- readonly kind = "agent-judge";
2177
- private readonly resolveJudgeProvider;
2178
- private readonly maxSteps;
2179
- private readonly temperature;
2180
- private readonly evaluatorTemplate?;
2181
- private readonly judgeTargetProvider?;
2182
- constructor(options: AgentJudgeEvaluatorOptions);
2183
- evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2184
- /**
2185
- * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
2186
- */
2187
- private evaluateBuiltIn;
2188
- /**
2189
- * Judge target mode: Delegates to an external agent provider via Provider.invoke().
2190
- */
2191
- private evaluateWithJudgeTarget;
2192
- /**
2193
- * Parse the agent's response text into an EvaluationScore.
2194
- * Supports both freeform and rubric modes.
2195
- */
2196
- private parseResult;
2197
- /**
2198
- * Build system prompt for built-in mode.
2199
- * Includes output format instructions.
2200
- */
2201
- private buildSystemPrompt;
2202
- /**
2203
- * Build user prompt for built-in mode.
2204
- * Uses custom template if provided, otherwise builds default prompt.
2205
- */
2206
- private buildUserPrompt;
2207
- /**
2208
- * Build the full evaluation prompt for judge target mode (delegation).
2209
- * Combines task context, criteria, candidate info, and output format instructions.
2210
- */
2211
- private buildDelegatedPrompt;
2335
+ /**
2336
+ * Built-in skill-trigger evaluator.
2337
+ *
2338
+ * Detects whether the agent invoked a named Claude Code skill as its first tool call.
2339
+ * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py:
2340
+ * - Only the FIRST tool call matters.
2341
+ * - Skill tool: checks input.skill contains the skill name (case-sensitive substring).
2342
+ * - Read tool: checks input.file_path contains the skill name (case-sensitive substring).
2343
+ * - Any other tool as first call means the skill was not triggered.
2344
+ * - Supports negative cases via should_trigger: false.
2345
+ */
2346
+
2347
+ declare class SkillTriggerEvaluator implements Evaluator {
2348
+ readonly kind = "skill-trigger";
2349
+ private readonly config;
2350
+ constructor(config: SkillTriggerEvaluatorConfig);
2351
+ evaluate(context: EvaluationContext): EvaluationScore;
2212
2352
  }
2213
2353
 
2214
- interface LlmJudgePromptAssembly {
2354
+ interface LlmGraderPromptAssembly {
2215
2355
  systemPrompt: string;
2216
2356
  userPrompt: string;
2217
2357
  responseSchema: string;
2218
2358
  mode: 'freeform' | 'checklist' | 'score_range';
2219
2359
  }
2220
- declare function assembleLlmJudgePrompt(input: {
2360
+ declare function assembleLlmGraderPrompt(input: {
2221
2361
  evalCase: EvalTest;
2222
2362
  candidate: string;
2223
2363
  promptInputs: PromptInputs;
2224
- evaluatorConfig?: LlmJudgeEvaluatorConfig;
2364
+ evaluatorConfig?: LlmGraderEvaluatorConfig;
2225
2365
  output?: readonly Message[];
2226
2366
  fileChanges?: string;
2227
2367
  evaluatorTemplateOverride?: string;
2228
- }): LlmJudgePromptAssembly;
2368
+ }): LlmGraderPromptAssembly;
2229
2369
 
2230
2370
  interface TokenUsageEvaluatorOptions {
2231
2371
  readonly config: TokenUsageEvaluatorConfig;
@@ -2322,18 +2462,22 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
2322
2462
  * Contains shared resources needed by evaluator instances.
2323
2463
  */
2324
2464
  interface EvaluatorDispatchContext {
2325
- /** Shared LLM judge provider (resolved at suite level) */
2465
+ /** Shared LLM grader provider (resolved at suite level) */
2466
+ readonly graderProvider?: Provider;
2467
+ /** @deprecated Use `graderProvider` instead */
2326
2468
  readonly judgeProvider?: Provider;
2327
2469
  /** Function to resolve target names to providers */
2328
2470
  readonly targetResolver?: TargetResolver;
2329
- /** Available target names for code judges */
2471
+ /** Available target names for code graders */
2330
2472
  readonly availableTargets?: readonly string[];
2331
2473
  /** Agent timeout in ms */
2332
2474
  readonly agentTimeoutMs?: number;
2333
2475
  /** Directory containing the eval file (for composite member resolution) */
2334
2476
  readonly evalFileDir?: string;
2335
- /** Shared LLM judge evaluator instance */
2336
- readonly llmJudge: Evaluator;
2477
+ /** Shared LLM grader evaluator instance */
2478
+ readonly llmGrader: Evaluator;
2479
+ /** @deprecated Use `llmGrader` instead */
2480
+ readonly llmJudge?: Evaluator;
2337
2481
  /** Reference to the registry itself (for composite evaluators that need to create children) */
2338
2482
  readonly registry: EvaluatorRegistry;
2339
2483
  }
@@ -2341,8 +2485,8 @@ interface EvaluatorDispatchContext {
2341
2485
  * Factory function that creates an Evaluator instance from a config.
2342
2486
  *
2343
2487
  * Factory functions handle all type-specific initialization logic:
2344
- * - Reading prompt files for LLM judges
2345
- * - Resolving script paths for code judges
2488
+ * - Reading prompt files for LLM graders
2489
+ * - Resolving script paths for code graders
2346
2490
  * - Creating adapter evaluators for deterministic assertions
2347
2491
  */
2348
2492
  type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
@@ -2404,7 +2548,7 @@ interface RunEvalCaseOptions {
2404
2548
  readonly provider: Provider;
2405
2549
  readonly target: ResolvedTarget;
2406
2550
  readonly evaluators: Partial<Record<string, Evaluator>> & {
2407
- readonly 'llm-judge': Evaluator;
2551
+ readonly 'llm-grader': Evaluator;
2408
2552
  };
2409
2553
  readonly now?: () => Date;
2410
2554
  readonly maxRetries?: number;
@@ -2412,10 +2556,10 @@ interface RunEvalCaseOptions {
2412
2556
  readonly cache?: EvaluationCache;
2413
2557
  readonly useCache?: boolean;
2414
2558
  readonly signal?: AbortSignal;
2415
- readonly judgeProvider?: Provider;
2416
- /** Resolver for target override in code judges */
2559
+ readonly graderProvider?: Provider;
2560
+ /** Resolver for target override in code graders */
2417
2561
  readonly targetResolver?: (name: string) => Provider | undefined;
2418
- /** List of available target names for code judges */
2562
+ /** List of available target names for code graders */
2419
2563
  readonly availableTargets?: readonly string[];
2420
2564
  /** Unique identifier for the evaluation run (used for workspace management) */
2421
2565
  readonly evalRunId?: string;
@@ -2498,10 +2642,44 @@ interface RunEvaluationOptions {
2498
2642
  readonly retainOnSuccess?: 'keep' | 'cleanup';
2499
2643
  /** Retention policy override for failed cases */
2500
2644
  readonly retainOnFailure?: 'keep' | 'cleanup';
2645
+ /** CLI override: grader target name (e.g., "agentv" or a target from targets.yaml) */
2646
+ readonly graderTarget?: string;
2647
+ /** CLI override: model for grader target (e.g., "openai:gpt-5-mini") */
2648
+ readonly model?: string;
2501
2649
  }
2502
2650
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2503
2651
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
2504
2652
 
2653
+ /**
2654
+ * Types for inline assertion functions used in the evaluate() API.
2655
+ *
2656
+ * Inline functions are the escape hatch for custom evaluation logic
2657
+ * that doesn't fit a built-in evaluator type. For built-in assertions
2658
+ * (contains, regex, is-json, etc.), use config objects instead:
2659
+ *
2660
+ * assert: [{ type: 'contains', value: 'hello' }]
2661
+ *
2662
+ * Inline functions are for custom logic:
2663
+ *
2664
+ * assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
2665
+ */
2666
+ /** Context passed to inline assertion functions */
2667
+ interface AssertContext {
2668
+ readonly input: string;
2669
+ readonly output: string;
2670
+ readonly expectedOutput?: string;
2671
+ readonly criteria?: string;
2672
+ readonly metadata?: Record<string, unknown>;
2673
+ }
2674
+ /** Result from an inline assertion function */
2675
+ interface AssertResult {
2676
+ readonly name: string;
2677
+ readonly score: number;
2678
+ readonly metadata?: Record<string, unknown>;
2679
+ }
2680
+ /** Inline assertion function signature */
2681
+ type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
2682
+
2505
2683
  /**
2506
2684
  * Programmatic API for running evaluations.
2507
2685
  *
@@ -2509,7 +2687,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
2509
2687
  * instead of a CLI. The config shape mirrors the YAML structure for easy
2510
2688
  * translation between file-based and programmatic usage.
2511
2689
  *
2512
- * @example Inline tests
2690
+ * @example Inline tests with config objects
2513
2691
  * ```typescript
2514
2692
  * import { evaluate } from '@agentv/core';
2515
2693
  *
@@ -2518,7 +2696,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
2518
2696
  * {
2519
2697
  * id: 'capital',
2520
2698
  * input: 'What is the capital of France?',
2521
- * expected_output: 'Paris',
2699
+ * expectedOutput: 'Paris',
2522
2700
  * assert: [{ type: 'contains', value: 'Paris' }],
2523
2701
  * },
2524
2702
  * ],
@@ -2528,6 +2706,27 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
2528
2706
  * console.log(results.summary.passed, 'passed');
2529
2707
  * ```
2530
2708
  *
2709
+ * @example Inline tests with task function and custom assertion
2710
+ * ```typescript
2711
+ * import { evaluate } from '@agentv/core';
2712
+ *
2713
+ * const { summary } = await evaluate({
2714
+ * tests: [
2715
+ * {
2716
+ * id: 'echo',
2717
+ * input: 'hello',
2718
+ * expectedOutput: 'Echo: hello',
2719
+ * assert: [
2720
+ * { type: 'contains', value: 'hello' },
2721
+ * { type: 'equals' },
2722
+ * ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
2723
+ * ],
2724
+ * },
2725
+ * ],
2726
+ * task: async (input) => `Echo: ${input}`,
2727
+ * });
2728
+ * ```
2729
+ *
2531
2730
  * @example File-based
2532
2731
  * ```typescript
2533
2732
  * const results = await evaluate({
@@ -2553,10 +2752,12 @@ interface EvalTestInput {
2553
2752
  role: string;
2554
2753
  content: string;
2555
2754
  }[];
2556
- /** Expected reference output */
2755
+ /** Expected reference output (camelCase preferred) */
2756
+ readonly expectedOutput?: string;
2757
+ /** @deprecated Use `expectedOutput` instead */
2557
2758
  readonly expected_output?: string;
2558
- /** Assertion evaluators */
2559
- readonly assert?: readonly EvalAssertionInput[];
2759
+ /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
2760
+ readonly assert?: readonly AssertEntry[];
2560
2761
  /** Arbitrary metadata */
2561
2762
  readonly metadata?: Record<string, unknown>;
2562
2763
  }
@@ -2592,6 +2793,8 @@ interface EvalAssertionInput {
2592
2793
  /** Additional properties */
2593
2794
  readonly [key: string]: unknown;
2594
2795
  }
2796
+ /** Assert entry: inline function or config object */
2797
+ type AssertEntry = AssertFn | EvalAssertionInput;
2595
2798
  /**
2596
2799
  * Configuration for `evaluate()`.
2597
2800
  * Accepts either inline tests or a spec file path.
@@ -2603,8 +2806,10 @@ interface EvalConfig {
2603
2806
  readonly specFile?: string;
2604
2807
  /** Target provider configuration */
2605
2808
  readonly target?: TargetDefinition;
2809
+ /** Custom task function — mutually exclusive with target */
2810
+ readonly task?: (input: string) => string | Promise<string>;
2606
2811
  /** Suite-level assertions applied to all tests */
2607
- readonly assert?: readonly EvalAssertionInput[];
2812
+ readonly assert?: readonly AssertEntry[];
2608
2813
  /** Filter tests by ID pattern (glob supported) */
2609
2814
  readonly filter?: string;
2610
2815
  /** Maximum concurrent workers (default: 3) */
@@ -3298,9 +3503,29 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3298
3503
  */
3299
3504
  declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
3300
3505
 
3506
+ /**
3507
+ * Convention-based discovery of custom grader scripts.
3508
+ *
3509
+ * Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
3510
+ * files and registers them as code-grader evaluators in the registry. The file name
3511
+ * (without extension) becomes the evaluator type name.
3512
+ *
3513
+ * Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
3514
+ */
3515
+
3516
+ /**
3517
+ * Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
3518
+ * and register them as evaluator types in the registry.
3519
+ *
3520
+ * @param registry - The evaluator registry to register discovered graders into
3521
+ * @param baseDir - The base directory to search from (typically project root or eval file dir)
3522
+ * @returns Names of discovered grader types
3523
+ */
3524
+ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
3525
+
3301
3526
  type AgentKernel = {
3302
3527
  status: string;
3303
3528
  };
3304
3529
  declare function createAgentKernel(): AgentKernel;
3305
3530
 
3306
- export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3531
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };