@agentv/core 2.18.4 → 3.0.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -8,12 +8,17 @@ interface ChatMessage {
8
8
  readonly name?: string;
9
9
  }
10
10
  type ChatPrompt = readonly ChatMessage[];
11
- type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
11
+ type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
12
12
  /** Callbacks for real-time observability during provider execution */
13
13
  interface ProviderStreamCallbacks {
14
14
  onToolCallStart?: (toolName: string, toolCallId?: string) => void;
15
15
  onToolCallEnd?: (toolName: string, input: unknown, output: unknown, durationMs: number, toolCallId?: string) => void;
16
16
  onLlmCallEnd?: (model: string, tokenUsage?: ProviderTokenUsage) => void;
17
+ /** Returns active OTel span IDs for Braintrust trace bridging (optional) */
18
+ getActiveSpanIds?: () => {
19
+ parentSpanId: string;
20
+ rootSpanId: string;
21
+ } | null;
17
22
  }
18
23
  interface ProviderRequest {
19
24
  readonly question: string;
@@ -36,6 +41,11 @@ interface ProviderRequest {
36
41
  readonly captureFileChanges?: boolean;
37
42
  /** Real-time observability callbacks (optional) */
38
43
  readonly streamCallbacks?: ProviderStreamCallbacks;
44
+ /** Braintrust span IDs for trace-claude-code plugin (optional) */
45
+ readonly braintrustSpanIds?: {
46
+ readonly parentSpanId: string;
47
+ readonly rootSpanId: string;
48
+ };
39
49
  }
40
50
  /**
41
51
  * A tool call within an output message.
@@ -134,6 +144,8 @@ type EnvLookup = Readonly<Record<string, string | undefined>>;
134
144
  interface TargetDefinition {
135
145
  readonly name: string;
136
146
  readonly provider: ProviderKind | string;
147
+ readonly grader_target?: string | undefined;
148
+ /** @deprecated Use `grader_target` instead */
137
149
  readonly judge_target?: string | undefined;
138
150
  readonly workers?: number | undefined;
139
151
  readonly provider_batching?: boolean | undefined;
@@ -453,11 +465,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
453
465
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
454
466
  */
455
467
  declare function isTestMessage(value: unknown): value is TestMessage;
456
- declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
468
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
457
469
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
458
470
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
459
471
  /**
460
- * Configuration for enabling target access in code-judge evaluators.
472
+ * Configuration for enabling target access in code-grader evaluators.
461
473
  * When present, the runtime will start a local proxy server that allows
462
474
  * the script to invoke configured targets without direct credential access.
463
475
  */
@@ -556,7 +568,7 @@ type WorkspaceConfig = {
556
568
  };
557
569
  type CodeEvaluatorConfig = {
558
570
  readonly name: string;
559
- readonly type: 'code-judge';
571
+ readonly type: 'code-judge' | 'code-grader';
560
572
  readonly command: readonly string[];
561
573
  /** @deprecated Use `command` instead */
562
574
  readonly script?: readonly string[];
@@ -567,14 +579,14 @@ type CodeEvaluatorConfig = {
567
579
  readonly required?: boolean | number;
568
580
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
569
581
  readonly negate?: boolean;
570
- /** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
582
+ /** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
571
583
  readonly config?: JsonObject;
572
584
  /** When present, enables target access via local proxy */
573
585
  readonly target?: TargetAccessConfig;
574
586
  };
575
587
  /**
576
588
  * Executable prompt template configuration.
577
- * Matches code-judge pattern for consistency.
589
+ * Matches code-grader pattern for consistency.
578
590
  */
579
591
  type PromptScriptConfig = {
580
592
  /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
@@ -584,24 +596,32 @@ type PromptScriptConfig = {
584
596
  /** Pass-through configuration for the prompt template */
585
597
  readonly config?: Record<string, unknown>;
586
598
  };
587
- type LlmJudgeEvaluatorConfig = {
599
+ type LlmGraderEvaluatorConfig = {
588
600
  readonly name: string;
589
- readonly type: 'llm-judge';
601
+ readonly type: 'llm-grader' | 'llm-judge';
590
602
  /** Text prompt (inline or file path) or executable script config */
591
603
  readonly prompt?: string | PromptScriptConfig;
592
604
  readonly promptPath?: string;
593
605
  /** Resolved absolute path for prompt file (used for text template prompts) */
594
606
  readonly resolvedPromptPath?: string;
595
- /** Resolved script array for executable prompts (matches code-judge pattern) */
607
+ /** Resolved script array for executable prompts (matches code-grader pattern) */
596
608
  readonly resolvedPromptScript?: readonly string[];
597
609
  readonly rubrics?: readonly RubricItem[];
598
610
  readonly weight?: number;
599
611
  readonly required?: boolean | number;
600
612
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
601
613
  readonly negate?: boolean;
614
+ /** Optional target override for this grader (uses a named LLM target from targets.yaml). */
615
+ readonly target?: string;
602
616
  /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
603
617
  readonly config?: Record<string, unknown>;
618
+ /** Maximum agent steps for agentv built-in mode (default 10, max 50). Ignored in LLM mode. */
619
+ readonly max_steps?: number;
620
+ /** Temperature override for grader calls */
621
+ readonly temperature?: number;
604
622
  };
623
+ /** @deprecated Use `LlmGraderEvaluatorConfig` instead */
624
+ type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
605
625
  /**
606
626
  * Score range definition for analytic rubric scoring.
607
627
  * Each range maps an integer score band (0-10) to an outcome description.
@@ -613,7 +633,7 @@ type ScoreRange = {
613
633
  readonly outcome: string;
614
634
  };
615
635
  /**
616
- * Rubric item for LLM judge evaluation.
636
+ * Rubric item for LLM grader evaluation.
617
637
  * Supports two modes:
618
638
  * - Checklist mode: boolean satisfied/not-satisfied with `outcome`
619
639
  * - Score-range mode: 0-10 integer scoring with `score_ranges`
@@ -638,7 +658,7 @@ type RubricItem = {
638
658
  readonly required_min_score?: number;
639
659
  /**
640
660
  * Score range definitions for analytic rubric scoring.
641
- * When present, the judge outputs an integer 0-10 score per criterion.
661
+ * When present, the grader outputs an integer 0-10 score per criterion.
642
662
  * Ranges must be non-overlapping and cover 0-10 inclusive.
643
663
  */
644
664
  readonly score_ranges?: readonly ScoreRange[];
@@ -646,10 +666,19 @@ type RubricItem = {
646
666
  type CompositeAggregatorConfig = {
647
667
  readonly type: 'weighted_average';
648
668
  readonly weights?: Record<string, number>;
669
+ } | {
670
+ readonly type: 'code-grader';
671
+ readonly path: string;
672
+ readonly cwd?: string;
649
673
  } | {
650
674
  readonly type: 'code-judge';
651
675
  readonly path: string;
652
676
  readonly cwd?: string;
677
+ } | {
678
+ readonly type: 'llm-grader';
679
+ readonly prompt?: string;
680
+ readonly promptPath?: string;
681
+ readonly model?: string;
653
682
  } | {
654
683
  readonly type: 'llm-judge';
655
684
  readonly prompt?: string;
@@ -662,7 +691,7 @@ type CompositeAggregatorConfig = {
662
691
  type CompositeEvaluatorConfig = {
663
692
  readonly name: string;
664
693
  readonly type: 'composite';
665
- readonly evaluators: readonly EvaluatorConfig[];
694
+ readonly assertions: readonly EvaluatorConfig[];
666
695
  readonly aggregator: CompositeAggregatorConfig;
667
696
  readonly weight?: number;
668
697
  readonly required?: boolean | number;
@@ -671,7 +700,7 @@ type CompositeEvaluatorConfig = {
671
700
  };
672
701
  /**
673
702
  * Match type for field accuracy evaluation.
674
- * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
703
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-grader evaluator.
675
704
  * See examples/features/document-extraction/fuzzy_match.ts for an example.
676
705
  */
677
706
  type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
@@ -786,34 +815,6 @@ type ExecutionMetricsEvaluatorConfig = {
786
815
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
787
816
  readonly negate?: boolean;
788
817
  };
789
- /**
790
- * Configuration for the agent-judge evaluator.
791
- * Runs an agentic investigation loop to audit workspaces and verify criteria.
792
- * Two modes:
793
- * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
794
- * - Judge target: Delegates to an external agent provider via Provider.invoke()
795
- */
796
- type AgentJudgeEvaluatorConfig = {
797
- readonly name: string;
798
- readonly type: 'agent-judge';
799
- /** Custom evaluation prompt (inline text or file path) */
800
- readonly prompt?: string;
801
- readonly promptPath?: string;
802
- /** Resolved absolute path for prompt file */
803
- readonly resolvedPromptPath?: string;
804
- /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
805
- readonly rubrics?: readonly RubricItem[];
806
- /** Maximum agent steps for built-in mode (default 10, max 50) */
807
- readonly max_steps?: number;
808
- /** Temperature for built-in mode (default 0) */
809
- readonly temperature?: number;
810
- /** Target name — delegates agent loop to this provider instead of built-in mode */
811
- readonly target?: string;
812
- readonly weight?: number;
813
- readonly required?: boolean | number;
814
- /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
815
- readonly negate?: boolean;
816
- };
817
818
  /**
818
819
  * Configuration for the contains assertion evaluator.
819
820
  * Checks whether the candidate output contains a specified substring.
@@ -971,7 +972,34 @@ type RubricsEvaluatorConfig = {
971
972
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
972
973
  readonly negate?: boolean;
973
974
  };
974
- type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig;
975
+ /**
976
+ * Configuration for the skill-trigger evaluator.
977
+ * Detects whether the agent invoked a named Claude Code skill as its first tool call.
978
+ * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py.
979
+ */
980
+ type SkillTriggerEvaluatorConfig = {
981
+ readonly name: string;
982
+ readonly type: 'skill-trigger';
983
+ /** The skill name to check for (case-sensitive substring match) */
984
+ readonly skill: string;
985
+ /** Whether the skill is expected to trigger (default: true) */
986
+ readonly should_trigger?: boolean;
987
+ readonly weight?: number;
988
+ readonly required?: boolean | number;
989
+ readonly negate?: boolean;
990
+ };
991
+ /**
992
+ * Configuration for the inline-assert evaluator.
993
+ * Wraps an AssertFn for in-process evaluation via the evaluate() API.
994
+ */
995
+ type InlineAssertEvaluatorConfig = {
996
+ readonly name: string;
997
+ readonly type: 'inline-assert';
998
+ readonly weight?: number;
999
+ readonly required?: boolean | number;
1000
+ readonly negate?: boolean;
1001
+ };
1002
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
975
1003
  /**
976
1004
  * Eval test definition sourced from AgentV specs.
977
1005
  */
@@ -989,7 +1017,7 @@ interface EvalTest {
989
1017
  readonly file_paths: readonly string[];
990
1018
  readonly criteria: string;
991
1019
  readonly evaluator?: EvaluatorKind;
992
- readonly evaluators?: readonly EvaluatorConfig[];
1020
+ readonly assertions?: readonly EvaluatorConfig[];
993
1021
  /** Workspace configuration (merged from suite-level and case-level) */
994
1022
  readonly workspace?: WorkspaceConfig;
995
1023
  /** Arbitrary metadata passed to workspace scripts via stdin */
@@ -1162,15 +1190,15 @@ interface EvaluatorResult {
1162
1190
  readonly rawRequest?: JsonObject;
1163
1191
  readonly evaluatorProviderRequest?: JsonObject;
1164
1192
  readonly scores?: readonly EvaluatorResult[];
1165
- /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
1193
+ /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
1166
1194
  readonly details?: JsonObject;
1167
1195
  /** Token usage from LLM calls made by this evaluator (optional). */
1168
1196
  readonly tokenUsage?: TokenUsage;
1169
- /** Wall-clock duration of this judge execution in milliseconds. */
1197
+ /** Wall-clock duration of this grader execution in milliseconds. */
1170
1198
  readonly durationMs?: number;
1171
- /** ISO 8601 UTC timestamp when this judge started executing. */
1199
+ /** ISO 8601 UTC timestamp when this grader started executing. */
1172
1200
  readonly startedAt?: string;
1173
- /** ISO 8601 UTC timestamp when this judge finished executing. */
1201
+ /** ISO 8601 UTC timestamp when this grader finished executing. */
1174
1202
  readonly endedAt?: string;
1175
1203
  }
1176
1204
  /**
@@ -1304,7 +1332,7 @@ declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): P
1304
1332
  /**
1305
1333
  * Detect file format by extension.
1306
1334
  */
1307
- declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
1335
+ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json';
1308
1336
 
1309
1337
  type LoadOptions = {
1310
1338
  readonly verbose?: boolean;
@@ -1356,6 +1384,83 @@ declare function loadTestById(evalFilePath: string, repoRoot: URL | string, eval
1356
1384
  /** @deprecated Use `loadTestById` instead */
1357
1385
  declare const loadEvalCaseById: typeof loadTestById;
1358
1386
 
1387
+ /**
1388
+ * Raw Agent Skills evals.json schema.
1389
+ * @see https://agentskills.io/skill-creation/evaluating-skills
1390
+ */
1391
+ interface AgentSkillsEvalsFile {
1392
+ readonly skill_name?: string;
1393
+ readonly evals: readonly AgentSkillsEvalCase[];
1394
+ }
1395
+ interface AgentSkillsEvalCase {
1396
+ readonly id: number;
1397
+ readonly prompt: string;
1398
+ readonly expected_output?: string;
1399
+ readonly files?: readonly string[];
1400
+ readonly assertions?: readonly string[];
1401
+ }
1402
+ /**
1403
+ * Detect whether a JSON file is in Agent Skills evals.json format.
1404
+ * Returns true if the parsed content has an `evals` array.
1405
+ */
1406
+ declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEvalsFile;
1407
+ /**
1408
+ * Parse already-loaded Agent Skills evals data into EvalTest[].
1409
+ * Exported for testing without file I/O.
1410
+ */
1411
+ declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
1412
+
1413
+ /**
1414
+ * EVAL.yaml → evals.json transpiler.
1415
+ *
1416
+ * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
1417
+ * for consumption by the skill-creator pipeline.
1418
+ *
1419
+ * Handles both `assertions:` (current) and `assert:` (deprecated alias).
1420
+ */
1421
+ interface EvalsJsonCase {
1422
+ id: number;
1423
+ prompt: string;
1424
+ expected_output?: string;
1425
+ files?: string[];
1426
+ should_trigger?: boolean;
1427
+ assertions: string[];
1428
+ }
1429
+ interface EvalsJsonFile {
1430
+ skill_name: string;
1431
+ evals: EvalsJsonCase[];
1432
+ }
1433
+ /**
1434
+ * Result of transpiling a single EVAL.yaml.
1435
+ * May produce multiple evals.json files (one per skill).
1436
+ */
1437
+ interface TranspileResult {
1438
+ /** Map from skill_name → EvalsJsonFile */
1439
+ files: Map<string, EvalsJsonFile>;
1440
+ /** Warning messages accumulated during transpilation */
1441
+ warnings: string[];
1442
+ }
1443
+ /**
1444
+ * Transpile a parsed EVAL.yaml object into one or more evals.json objects.
1445
+ *
1446
+ * @param suite Parsed YAML object (already loaded, no file I/O here)
1447
+ * @param source Source identifier for error messages (e.g. file path)
1448
+ */
1449
+ declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
1450
+ /**
1451
+ * Transpile an EVAL.yaml file into one or more evals.json objects.
1452
+ * Returns a TranspileResult: a map from skill_name → EvalsJsonFile plus any warnings (pass it to getOutputFilenames for the output filename(s)).
1453
+ *
1454
+ * @param evalYamlPath Absolute path to the EVAL.yaml file
1455
+ */
1456
+ declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
1457
+ /**
1458
+ * Determine the output filename(s) for a transpile result.
1459
+ * Single skill → "evals.json"
1460
+ * Multiple skills → "<skill>.evals.json"
1461
+ */
1462
+ declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
1463
+
1359
1464
  declare function fileExists(filePath: string): Promise<boolean>;
1360
1465
  /**
1361
1466
  * Normalize line endings to LF (\n).
@@ -1603,87 +1708,112 @@ interface VSCodeResolvedConfig {
1603
1708
  readonly workspaceTemplate?: string;
1604
1709
  readonly timeoutMs?: number;
1605
1710
  }
1711
+ interface AgentVResolvedConfig {
1712
+ readonly model: string;
1713
+ readonly temperature: number;
1714
+ }
1606
1715
  type ResolvedTarget = {
1607
1716
  readonly kind: 'azure';
1608
1717
  readonly name: string;
1609
- readonly judgeTarget?: string;
1718
+ readonly graderTarget?: string;
1610
1719
  readonly workers?: number;
1611
1720
  readonly providerBatching?: boolean;
1612
1721
  readonly config: AzureResolvedConfig;
1613
1722
  } | {
1614
1723
  readonly kind: 'anthropic';
1615
1724
  readonly name: string;
1616
- readonly judgeTarget?: string;
1725
+ readonly graderTarget?: string;
1617
1726
  readonly workers?: number;
1618
1727
  readonly providerBatching?: boolean;
1619
1728
  readonly config: AnthropicResolvedConfig;
1620
1729
  } | {
1621
1730
  readonly kind: 'gemini';
1622
1731
  readonly name: string;
1623
- readonly judgeTarget?: string;
1732
+ readonly graderTarget?: string;
1624
1733
  readonly workers?: number;
1625
1734
  readonly providerBatching?: boolean;
1626
1735
  readonly config: GeminiResolvedConfig;
1627
1736
  } | {
1628
1737
  readonly kind: 'codex';
1629
1738
  readonly name: string;
1630
- readonly judgeTarget?: string;
1739
+ readonly graderTarget?: string;
1631
1740
  readonly workers?: number;
1632
1741
  readonly providerBatching?: boolean;
1633
1742
  readonly config: CodexResolvedConfig;
1634
1743
  } | {
1635
1744
  readonly kind: 'copilot-sdk';
1636
1745
  readonly name: string;
1637
- readonly judgeTarget?: string;
1746
+ readonly graderTarget?: string;
1638
1747
  readonly workers?: number;
1639
1748
  readonly providerBatching?: boolean;
1640
1749
  readonly config: CopilotSdkResolvedConfig;
1641
1750
  } | {
1642
1751
  readonly kind: 'copilot-cli';
1643
1752
  readonly name: string;
1644
- readonly judgeTarget?: string;
1753
+ readonly graderTarget?: string;
1645
1754
  readonly workers?: number;
1646
1755
  readonly providerBatching?: boolean;
1647
1756
  readonly config: CopilotCliResolvedConfig;
1648
1757
  } | {
1649
1758
  readonly kind: 'pi-coding-agent';
1650
1759
  readonly name: string;
1651
- readonly judgeTarget?: string;
1760
+ readonly graderTarget?: string;
1652
1761
  readonly workers?: number;
1653
1762
  readonly providerBatching?: boolean;
1654
1763
  readonly config: PiCodingAgentResolvedConfig;
1655
1764
  } | {
1656
1765
  readonly kind: 'pi-agent-sdk';
1657
1766
  readonly name: string;
1658
- readonly judgeTarget?: string;
1767
+ readonly graderTarget?: string;
1659
1768
  readonly workers?: number;
1660
1769
  readonly providerBatching?: boolean;
1661
1770
  readonly config: PiAgentSdkResolvedConfig;
1662
1771
  } | {
1663
1772
  readonly kind: 'claude';
1664
1773
  readonly name: string;
1665
- readonly judgeTarget?: string;
1774
+ readonly graderTarget?: string;
1775
+ readonly workers?: number;
1776
+ readonly providerBatching?: boolean;
1777
+ readonly config: ClaudeResolvedConfig;
1778
+ } | {
1779
+ readonly kind: 'claude-cli';
1780
+ readonly name: string;
1781
+ readonly graderTarget?: string;
1782
+ readonly workers?: number;
1783
+ readonly providerBatching?: boolean;
1784
+ readonly config: ClaudeResolvedConfig;
1785
+ } | {
1786
+ readonly kind: 'claude-sdk';
1787
+ readonly name: string;
1788
+ readonly graderTarget?: string;
1666
1789
  readonly workers?: number;
1667
1790
  readonly providerBatching?: boolean;
1668
1791
  readonly config: ClaudeResolvedConfig;
1669
1792
  } | {
1670
1793
  readonly kind: 'mock';
1671
1794
  readonly name: string;
1672
- readonly judgeTarget?: string;
1795
+ readonly graderTarget?: string;
1673
1796
  readonly workers?: number;
1674
1797
  readonly providerBatching?: boolean;
1675
1798
  readonly config: MockResolvedConfig;
1676
1799
  } | {
1677
1800
  readonly kind: 'vscode' | 'vscode-insiders';
1678
1801
  readonly name: string;
1679
- readonly judgeTarget?: string;
1802
+ readonly graderTarget?: string;
1680
1803
  readonly workers?: number;
1681
1804
  readonly providerBatching?: boolean;
1682
1805
  readonly config: VSCodeResolvedConfig;
1806
+ } | {
1807
+ readonly kind: 'agentv';
1808
+ readonly name: string;
1809
+ readonly graderTarget?: string;
1810
+ readonly workers?: number;
1811
+ readonly providerBatching?: boolean;
1812
+ readonly config: AgentVResolvedConfig;
1683
1813
  } | {
1684
1814
  readonly kind: 'cli';
1685
1815
  readonly name: string;
1686
- readonly judgeTarget?: string;
1816
+ readonly graderTarget?: string;
1687
1817
  readonly workers?: number;
1688
1818
  readonly providerBatching?: boolean;
1689
1819
  readonly config: CliResolvedConfig;
@@ -1835,7 +1965,7 @@ declare function resolveAndCreateProvider(definition: TargetDefinition, env?: En
1835
1965
 
1836
1966
  /**
1837
1967
  * Function to resolve a target name to a provider.
1838
- * Used by code judges to support target override.
1968
+ * Used by code graders to support target override.
1839
1969
  */
1840
1970
  type TargetResolver = (targetName: string) => Provider | undefined;
1841
1971
  interface EvaluationContext {
@@ -1851,6 +1981,8 @@ interface EvaluationContext {
1851
1981
  readonly chatPrompt?: ChatPrompt;
1852
1982
  };
1853
1983
  readonly now: Date;
1984
+ readonly graderProvider?: Provider;
1985
+ /** @deprecated Use `graderProvider` instead */
1854
1986
  readonly judgeProvider?: Provider;
1855
1987
  readonly evaluatorTemplateOverride?: string;
1856
1988
  readonly evaluator?: EvaluatorConfig;
@@ -1868,9 +2000,9 @@ interface EvaluationContext {
1868
2000
  readonly startTime?: string;
1869
2001
  /** ISO 8601 timestamp when execution ended */
1870
2002
  readonly endTime?: string;
1871
- /** Resolver for target override in code judges */
2003
+ /** Resolver for target override in code graders */
1872
2004
  readonly targetResolver?: TargetResolver;
1873
- /** List of available target names for code judges */
2005
+ /** List of available target names for code graders */
1874
2006
  readonly availableTargets?: readonly string[];
1875
2007
  /** Unified diff of file changes from workspace (when workspace_template is configured) */
1876
2008
  readonly fileChanges?: string;
@@ -1886,7 +2018,7 @@ interface EvaluationScore {
1886
2018
  readonly reasoning?: string;
1887
2019
  readonly evaluatorRawRequest?: JsonObject;
1888
2020
  readonly scores?: readonly ChildEvaluatorResult[];
1889
- /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
2021
+ /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
1890
2022
  readonly details?: JsonObject;
1891
2023
  /** Token usage from LLM calls made by this evaluator (optional). */
1892
2024
  readonly tokenUsage?: TokenUsage;
@@ -1902,7 +2034,7 @@ interface ChildEvaluatorResult {
1902
2034
  readonly reasoning?: string;
1903
2035
  readonly evaluatorRawRequest?: JsonObject;
1904
2036
  readonly scores?: readonly ChildEvaluatorResult[];
1905
- /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
2037
+ /** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
1906
2038
  readonly details?: JsonObject;
1907
2039
  /** Token usage from LLM calls made by this evaluator (optional). */
1908
2040
  readonly tokenUsage?: TokenUsage;
@@ -2061,12 +2193,18 @@ declare class LatencyEvaluator implements Evaluator {
2061
2193
  * Custom evaluators can override this via evaluatorTemplate option.
2062
2194
  */
2063
2195
  declare const DEFAULT_EVALUATOR_TEMPLATE: string;
2064
- type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
2065
- interface LlmJudgeEvaluatorOptions {
2066
- readonly resolveJudgeProvider: JudgeProviderResolver;
2196
+ type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
2197
+ interface LlmGraderEvaluatorOptions {
2198
+ readonly resolveGraderProvider: GraderProviderResolver;
2199
+ /** @deprecated Use `resolveGraderProvider` instead. */
2200
+ readonly resolveJudgeProvider?: GraderProviderResolver;
2067
2201
  readonly maxOutputTokens?: number;
2068
2202
  readonly temperature?: number;
2069
2203
  readonly evaluatorTemplate?: string;
2204
+ readonly maxSteps?: number;
2205
+ readonly graderTargetProvider?: Provider;
2206
+ /** @deprecated Use `graderTargetProvider` instead. */
2207
+ readonly judgeTargetProvider?: Provider;
2070
2208
  }
2071
2209
  declare const freeformEvaluationSchema: z.ZodObject<{
2072
2210
  score: z.ZodNumber;
@@ -2115,13 +2253,15 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2115
2253
  overall_reasoning: string;
2116
2254
  }>;
2117
2255
 
2118
- declare class LlmJudgeEvaluator implements Evaluator {
2119
- readonly kind = "llm-judge";
2120
- private readonly resolveJudgeProvider;
2256
+ declare class LlmGraderEvaluator implements Evaluator {
2257
+ readonly kind = "llm-grader";
2258
+ private readonly resolveGraderProvider;
2121
2259
  private readonly maxOutputTokens?;
2122
2260
  private readonly temperature?;
2123
2261
  private readonly evaluatorTemplate?;
2124
- constructor(options: LlmJudgeEvaluatorOptions);
2262
+ private readonly maxSteps;
2263
+ private readonly graderTargetProvider?;
2264
+ constructor(options: LlmGraderEvaluatorOptions);
2125
2265
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2126
2266
  private evaluateFreeform;
2127
2267
  private evaluateWithRubrics;
@@ -2130,6 +2270,43 @@ declare class LlmJudgeEvaluator implements Evaluator {
2130
2270
  * Each criterion is scored 0-10 and normalized to 0-1.
2131
2271
  */
2132
2272
  private evaluateWithScoreRanges;
2273
+ /**
2274
+ * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
2275
+ */
2276
+ private evaluateBuiltIn;
2277
+ /**
2278
+ * Grader target mode: Delegates to an explicit graderTargetProvider via Provider.invoke().
2279
+ */
2280
+ private evaluateWithGraderTarget;
2281
+ /**
2282
+ * Delegate mode: resolved provider is an agent provider — send prompt via invoke().
2283
+ */
2284
+ private evaluateWithDelegatedAgent;
2285
+ /**
2286
+ * Shared implementation for grader_target and delegate modes.
2287
+ * Both invoke a provider and parse the agent result from the response.
2288
+ */
2289
+ private evaluateWithDelegate;
2290
+ /**
2291
+ * Build system prompt for built-in agent mode.
2292
+ * Includes output format instructions.
2293
+ */
2294
+ private buildAgentSystemPrompt;
2295
+ /**
2296
+ * Build user prompt for built-in agent mode.
2297
+ * Uses custom template if provided, otherwise builds default prompt.
2298
+ */
2299
+ private buildAgentUserPrompt;
2300
+ /**
2301
+ * Build the full evaluation prompt for delegate mode (agent providers).
2302
+ * Combines task context, criteria, candidate info, and output format instructions.
2303
+ */
2304
+ private buildDelegatedPrompt;
2305
+ /**
2306
+ * Parse the agent's response text into an EvaluationScore.
2307
+ * Supports both freeform and rubric modes.
2308
+ */
2309
+ private parseAgentResult;
2133
2310
  /**
2134
2311
  * Build prompt for score-range rubric evaluation.
2135
2312
  */
@@ -2155,67 +2332,40 @@ declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSch
2155
2332
  */
2156
2333
  declare function buildScoreRangeOutputSchema(): string;
2157
2334
 
2158
- interface AgentJudgeEvaluatorOptions {
2159
- readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise<Provider | undefined>;
2160
- readonly maxSteps?: number;
2161
- readonly temperature?: number;
2162
- readonly evaluatorTemplate?: string;
2163
- readonly judgeTargetProvider?: Provider;
2164
- }
2165
- declare class AgentJudgeEvaluator implements Evaluator {
2166
- readonly kind = "agent-judge";
2167
- private readonly resolveJudgeProvider;
2168
- private readonly maxSteps;
2169
- private readonly temperature;
2170
- private readonly evaluatorTemplate?;
2171
- private readonly judgeTargetProvider?;
2172
- constructor(options: AgentJudgeEvaluatorOptions);
2173
- evaluate(context: EvaluationContext): Promise<EvaluationScore>;
2174
- /**
2175
- * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
2176
- */
2177
- private evaluateBuiltIn;
2178
- /**
2179
- * Judge target mode: Delegates to an external agent provider via Provider.invoke().
2180
- */
2181
- private evaluateWithJudgeTarget;
2182
- /**
2183
- * Parse the agent's response text into an EvaluationScore.
2184
- * Supports both freeform and rubric modes.
2185
- */
2186
- private parseResult;
2187
- /**
2188
- * Build system prompt for built-in mode.
2189
- * Includes output format instructions.
2190
- */
2191
- private buildSystemPrompt;
2192
- /**
2193
- * Build user prompt for built-in mode.
2194
- * Uses custom template if provided, otherwise builds default prompt.
2195
- */
2196
- private buildUserPrompt;
2197
- /**
2198
- * Build the full evaluation prompt for judge target mode (delegation).
2199
- * Combines task context, criteria, candidate info, and output format instructions.
2200
- */
2201
- private buildDelegatedPrompt;
2335
+ /**
2336
+ * Built-in skill-trigger evaluator.
2337
+ *
2338
+ * Detects whether the agent invoked a named Claude Code skill as its first tool call.
2339
+ * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py:
2340
+ * - Only the FIRST tool call matters.
2341
+ * - Skill tool: checks input.skill contains the skill name (case-sensitive substring).
2342
+ * - Read tool: checks input.file_path contains the skill name (case-sensitive substring).
2343
+ * - Any other tool as first call means the skill was not triggered.
2344
+ * - Supports negative cases via should_trigger: false.
2345
+ */
2346
+
2347
+ declare class SkillTriggerEvaluator implements Evaluator {
2348
+ readonly kind = "skill-trigger";
2349
+ private readonly config;
2350
+ constructor(config: SkillTriggerEvaluatorConfig);
2351
+ evaluate(context: EvaluationContext): EvaluationScore;
2202
2352
  }
2203
2353
 
2204
- interface LlmJudgePromptAssembly {
2354
+ interface LlmGraderPromptAssembly {
2205
2355
  systemPrompt: string;
2206
2356
  userPrompt: string;
2207
2357
  responseSchema: string;
2208
2358
  mode: 'freeform' | 'checklist' | 'score_range';
2209
2359
  }
2210
- declare function assembleLlmJudgePrompt(input: {
2360
+ declare function assembleLlmGraderPrompt(input: {
2211
2361
  evalCase: EvalTest;
2212
2362
  candidate: string;
2213
2363
  promptInputs: PromptInputs;
2214
- evaluatorConfig?: LlmJudgeEvaluatorConfig;
2364
+ evaluatorConfig?: LlmGraderEvaluatorConfig;
2215
2365
  output?: readonly Message[];
2216
2366
  fileChanges?: string;
2217
2367
  evaluatorTemplateOverride?: string;
2218
- }): LlmJudgePromptAssembly;
2368
+ }): LlmGraderPromptAssembly;
2219
2369
 
2220
2370
  interface TokenUsageEvaluatorOptions {
2221
2371
  readonly config: TokenUsageEvaluatorConfig;
@@ -2312,18 +2462,22 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
2312
2462
  * Contains shared resources needed by evaluator instances.
2313
2463
  */
2314
2464
  interface EvaluatorDispatchContext {
2315
- /** Shared LLM judge provider (resolved at suite level) */
2465
+ /** Shared LLM grader provider (resolved at suite level) */
2466
+ readonly graderProvider?: Provider;
2467
+ /** @deprecated Use `graderProvider` instead */
2316
2468
  readonly judgeProvider?: Provider;
2317
2469
  /** Function to resolve target names to providers */
2318
2470
  readonly targetResolver?: TargetResolver;
2319
- /** Available target names for code judges */
2471
+ /** Available target names for code graders */
2320
2472
  readonly availableTargets?: readonly string[];
2321
2473
  /** Agent timeout in ms */
2322
2474
  readonly agentTimeoutMs?: number;
2323
2475
  /** Directory containing the eval file (for composite member resolution) */
2324
2476
  readonly evalFileDir?: string;
2325
- /** Shared LLM judge evaluator instance */
2326
- readonly llmJudge: Evaluator;
2477
+ /** Shared LLM grader evaluator instance */
2478
+ readonly llmGrader: Evaluator;
2479
+ /** @deprecated Use `llmGrader` instead */
2480
+ readonly llmJudge?: Evaluator;
2327
2481
  /** Reference to the registry itself (for composite evaluators that need to create children) */
2328
2482
  readonly registry: EvaluatorRegistry;
2329
2483
  }
@@ -2331,8 +2485,8 @@ interface EvaluatorDispatchContext {
2331
2485
  * Factory function that creates an Evaluator instance from a config.
2332
2486
  *
2333
2487
  * Factory functions handle all type-specific initialization logic:
2334
- * - Reading prompt files for LLM judges
2335
- * - Resolving script paths for code judges
2488
+ * - Reading prompt files for LLM graders
2489
+ * - Resolving script paths for code graders
2336
2490
  * - Creating adapter evaluators for deterministic assertions
2337
2491
  */
2338
2492
  type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
@@ -2394,7 +2548,7 @@ interface RunEvalCaseOptions {
2394
2548
  readonly provider: Provider;
2395
2549
  readonly target: ResolvedTarget;
2396
2550
  readonly evaluators: Partial<Record<string, Evaluator>> & {
2397
- readonly 'llm-judge': Evaluator;
2551
+ readonly 'llm-grader': Evaluator;
2398
2552
  };
2399
2553
  readonly now?: () => Date;
2400
2554
  readonly maxRetries?: number;
@@ -2402,10 +2556,10 @@ interface RunEvalCaseOptions {
2402
2556
  readonly cache?: EvaluationCache;
2403
2557
  readonly useCache?: boolean;
2404
2558
  readonly signal?: AbortSignal;
2405
- readonly judgeProvider?: Provider;
2406
- /** Resolver for target override in code judges */
2559
+ readonly graderProvider?: Provider;
2560
+ /** Resolver for target override in code graders */
2407
2561
  readonly targetResolver?: (name: string) => Provider | undefined;
2408
- /** List of available target names for code judges */
2562
+ /** List of available target names for code graders */
2409
2563
  readonly availableTargets?: readonly string[];
2410
2564
  /** Unique identifier for the evaluation run (used for workspace management) */
2411
2565
  readonly evalRunId?: string;
@@ -2488,10 +2642,44 @@ interface RunEvaluationOptions {
2488
2642
  readonly retainOnSuccess?: 'keep' | 'cleanup';
2489
2643
  /** Retention policy override for failed cases */
2490
2644
  readonly retainOnFailure?: 'keep' | 'cleanup';
2645
+ /** CLI override: grader target name (e.g., "agentv" or a target from targets.yaml) */
2646
+ readonly graderTarget?: string;
2647
+ /** CLI override: model for grader target (e.g., "openai:gpt-5-mini") */
2648
+ readonly model?: string;
2491
2649
  }
2492
2650
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2493
2651
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
2494
2652
 
2653
+ /**
2654
+ * Types for inline assertion functions used in the evaluate() API.
2655
+ *
2656
+ * Inline functions are the escape hatch for custom evaluation logic
2657
+ * that doesn't fit a built-in evaluator type. For built-in assertions
2658
+ * (contains, regex, is-json, etc.), use config objects instead:
2659
+ *
2660
+ * assert: [{ type: 'contains', value: 'hello' }]
2661
+ *
2662
+ * Inline functions are for custom logic:
2663
+ *
2664
+ * assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
2665
+ */
2666
+ /** Context passed to inline assertion functions */
2667
+ interface AssertContext {
2668
+ readonly input: string;
2669
+ readonly output: string;
2670
+ readonly expectedOutput?: string;
2671
+ readonly criteria?: string;
2672
+ readonly metadata?: Record<string, unknown>;
2673
+ }
2674
+ /** Result from an inline assertion function */
2675
+ interface AssertResult {
2676
+ readonly name: string;
2677
+ readonly score: number;
2678
+ readonly metadata?: Record<string, unknown>;
2679
+ }
2680
+ /** Inline assertion function signature */
2681
+ type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
2682
+
2495
2683
  /**
2496
2684
  * Programmatic API for running evaluations.
2497
2685
  *
@@ -2499,7 +2687,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
2499
2687
  * instead of a CLI. The config shape mirrors the YAML structure for easy
2500
2688
  * translation between file-based and programmatic usage.
2501
2689
  *
2502
- * @example Inline tests
2690
+ * @example Inline tests with config objects
2503
2691
  * ```typescript
2504
2692
  * import { evaluate } from '@agentv/core';
2505
2693
  *
@@ -2508,7 +2696,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
2508
2696
  * {
2509
2697
  * id: 'capital',
2510
2698
  * input: 'What is the capital of France?',
2511
- * expected_output: 'Paris',
2699
+ * expectedOutput: 'Paris',
2512
2700
  * assert: [{ type: 'contains', value: 'Paris' }],
2513
2701
  * },
2514
2702
  * ],
@@ -2518,6 +2706,27 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
2518
2706
  * console.log(results.summary.passed, 'passed');
2519
2707
  * ```
2520
2708
  *
2709
+ * @example Inline tests with task function and custom assertion
2710
+ * ```typescript
2711
+ * import { evaluate } from '@agentv/core';
2712
+ *
2713
+ * const { summary } = await evaluate({
2714
+ * tests: [
2715
+ * {
2716
+ * id: 'echo',
2717
+ * input: 'hello',
2718
+ * expectedOutput: 'Echo: hello',
2719
+ * assert: [
2720
+ * { type: 'contains', value: 'hello' },
2721
+ * { type: 'equals' },
2722
+ * ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
2723
+ * ],
2724
+ * },
2725
+ * ],
2726
+ * task: async (input) => `Echo: ${input}`,
2727
+ * });
2728
+ * ```
2729
+ *
2521
2730
  * @example File-based
2522
2731
  * ```typescript
2523
2732
  * const results = await evaluate({
@@ -2543,10 +2752,12 @@ interface EvalTestInput {
2543
2752
  role: string;
2544
2753
  content: string;
2545
2754
  }[];
2546
- /** Expected reference output */
2755
+ /** Expected reference output (camelCase preferred) */
2756
+ readonly expectedOutput?: string;
2757
+ /** @deprecated Use `expectedOutput` instead */
2547
2758
  readonly expected_output?: string;
2548
- /** Assertion evaluators */
2549
- readonly assert?: readonly EvalAssertionInput[];
2759
+ /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
2760
+ readonly assert?: readonly AssertEntry[];
2550
2761
  /** Arbitrary metadata */
2551
2762
  readonly metadata?: Record<string, unknown>;
2552
2763
  }
@@ -2582,6 +2793,8 @@ interface EvalAssertionInput {
2582
2793
  /** Additional properties */
2583
2794
  readonly [key: string]: unknown;
2584
2795
  }
2796
+ /** Assert entry: inline function or config object */
2797
+ type AssertEntry = AssertFn | EvalAssertionInput;
2585
2798
  /**
2586
2799
  * Configuration for `evaluate()`.
2587
2800
  * Accepts either inline tests or a spec file path.
@@ -2593,8 +2806,10 @@ interface EvalConfig {
2593
2806
  readonly specFile?: string;
2594
2807
  /** Target provider configuration */
2595
2808
  readonly target?: TargetDefinition;
2809
+ /** Custom task function — mutually exclusive with target */
2810
+ readonly task?: (input: string) => string | Promise<string>;
2596
2811
  /** Suite-level assertions applied to all tests */
2597
- readonly assert?: readonly EvalAssertionInput[];
2812
+ readonly assert?: readonly AssertEntry[];
2598
2813
  /** Filter tests by ID pattern (glob supported) */
2599
2814
  readonly filter?: string;
2600
2815
  /** Maximum concurrent workers (default: 3) */
@@ -3207,6 +3422,11 @@ declare class OtelStreamingObserver {
3207
3422
  onLlmCall(model: string, tokenUsage?: ProviderTokenUsage): void;
3208
3423
  /** Finalize root span with score/verdict after evaluation completes */
3209
3424
  finalizeEvalCase(score: number, error?: string): void;
3425
+ /** Return the active eval span's trace ID and span ID for Braintrust trace bridging */
3426
+ getActiveSpanIds(): {
3427
+ parentSpanId: string;
3428
+ rootSpanId: string;
3429
+ } | null;
3210
3430
  /** Get ProviderStreamCallbacks for passing to providers */
3211
3431
  getStreamCallbacks(): ProviderStreamCallbacks;
3212
3432
  }
@@ -3283,9 +3503,29 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3283
3503
  */
3284
3504
  declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
3285
3505
 
3506
+ /**
3507
+ * Convention-based discovery of custom grader scripts.
3508
+ *
3509
+ * Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
3510
+ * files and registers them as code-grader evaluators in the registry. The file name
3511
+ * (without extension) becomes the evaluator type name.
3512
+ *
3513
+ * Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
3514
+ */
3515
+
3516
+ /**
3517
+ * Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
3518
+ * and register them as evaluator types in the registry.
3519
+ *
3520
+ * @param registry - The evaluator registry to register discovered graders into
3521
+ * @param baseDir - The base directory to search from (typically project root or eval file dir)
3522
+ * @returns Names of discovered grader types
3523
+ */
3524
+ declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
3525
+
3286
3526
  type AgentKernel = {
3287
3527
  status: string;
3288
3528
  };
3289
3529
  declare function createAgentKernel(): AgentKernel;
3290
3530
 
3291
- export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type 
GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type 
VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, 
runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3531
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, 
type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type 
ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, 
mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };