@agentv/core 4.18.0-next.1 → 4.19.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1455,156 +1455,558 @@ interface GraderResult {
1455
1455
  readonly endedAt?: string;
1456
1456
  }
1457
1457
 
1458
- declare const MetadataSchema: z.ZodObject<{
1459
- name: z.ZodString;
1460
- description: z.ZodOptional<z.ZodString>;
1461
- version: z.ZodOptional<z.ZodString>;
1462
- author: z.ZodOptional<z.ZodString>;
1463
- tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1464
- license: z.ZodOptional<z.ZodString>;
1465
- requires: z.ZodOptional<z.ZodObject<{
1466
- agentv: z.ZodOptional<z.ZodString>;
1467
- }, "strip", z.ZodTypeAny, {
1468
- agentv?: string | undefined;
1458
+ /**
1459
+ * Strict normalized schema for CLI target configuration.
1460
+ * This is the final validated shape after environment variable resolution
1461
+ * and internal field normalization.
1462
+ *
1463
+ * Uses .strict() to reject unknown properties, ensuring configuration
1464
+ * errors are caught early rather than silently ignored.
1465
+ *
1466
+ * @example
1467
+ * ```typescript
1468
+ * const config: CliNormalizedConfig = {
1469
+ * command: 'agent run {PROMPT}',
1470
+ * timeoutMs: 120000,
1471
+ * verbose: true,
1472
+ * };
1473
+ * CliTargetConfigSchema.parse(config); // Validates the normalized config
1474
+ * ```
1475
+ */
1476
+ declare const CliTargetConfigSchema: z.ZodObject<{
1477
+ command: z.ZodString;
1478
+ filesFormat: z.ZodOptional<z.ZodString>;
1479
+ cwd: z.ZodOptional<z.ZodString>;
1480
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
1481
+ healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
1482
+ url: z.ZodString;
1483
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
1484
+ }, "strict", z.ZodTypeAny, {
1485
+ url: string;
1486
+ timeoutMs?: number | undefined;
1469
1487
  }, {
1470
- agentv?: string | undefined;
1471
- }>>;
1472
- }, "strip", z.ZodTypeAny, {
1473
- name: string;
1474
- description?: string | undefined;
1475
- version?: string | undefined;
1476
- author?: string | undefined;
1477
- tags?: string[] | undefined;
1478
- license?: string | undefined;
1479
- requires?: {
1480
- agentv?: string | undefined;
1488
+ url: string;
1489
+ timeoutMs?: number | undefined;
1490
+ }>, z.ZodObject<{
1491
+ command: z.ZodString;
1492
+ cwd: z.ZodOptional<z.ZodString>;
1493
+ timeoutMs: z.ZodOptional<z.ZodNumber>;
1494
+ }, "strict", z.ZodTypeAny, {
1495
+ command: string;
1496
+ timeoutMs?: number | undefined;
1497
+ cwd?: string | undefined;
1498
+ }, {
1499
+ command: string;
1500
+ timeoutMs?: number | undefined;
1501
+ cwd?: string | undefined;
1502
+ }>]>>;
1503
+ verbose: z.ZodOptional<z.ZodBoolean>;
1504
+ keepTempFiles: z.ZodOptional<z.ZodBoolean>;
1505
+ }, "strict", z.ZodTypeAny, {
1506
+ command: string;
1507
+ timeoutMs?: number | undefined;
1508
+ cwd?: string | undefined;
1509
+ verbose?: boolean | undefined;
1510
+ healthcheck?: {
1511
+ url: string;
1512
+ timeoutMs?: number | undefined;
1513
+ } | {
1514
+ command: string;
1515
+ timeoutMs?: number | undefined;
1516
+ cwd?: string | undefined;
1481
1517
  } | undefined;
1518
+ filesFormat?: string | undefined;
1519
+ keepTempFiles?: boolean | undefined;
1482
1520
  }, {
1483
- name: string;
1484
- description?: string | undefined;
1485
- version?: string | undefined;
1486
- author?: string | undefined;
1487
- tags?: string[] | undefined;
1488
- license?: string | undefined;
1489
- requires?: {
1490
- agentv?: string | undefined;
1521
+ command: string;
1522
+ timeoutMs?: number | undefined;
1523
+ cwd?: string | undefined;
1524
+ verbose?: boolean | undefined;
1525
+ healthcheck?: {
1526
+ url: string;
1527
+ timeoutMs?: number | undefined;
1528
+ } | {
1529
+ command: string;
1530
+ timeoutMs?: number | undefined;
1531
+ cwd?: string | undefined;
1491
1532
  } | undefined;
1533
+ filesFormat?: string | undefined;
1534
+ keepTempFiles?: boolean | undefined;
1492
1535
  }>;
1493
- type EvalMetadata = z.infer<typeof MetadataSchema>;
1494
-
1495
- declare const DEFAULT_EVAL_PATTERNS: readonly string[];
1496
- type ExecutionDefaults = {
1497
- readonly verbose?: boolean;
1498
- readonly keep_workspaces?: boolean;
1499
- readonly otel_file?: string;
1500
- readonly export_otel?: boolean;
1501
- readonly otel_backend?: string;
1502
- readonly otel_capture_content?: boolean;
1503
- readonly otel_group_turns?: boolean;
1504
- readonly pool_workspaces?: boolean;
1505
- readonly pool_slots?: number;
1506
- };
1507
- type ResultsExportConfig = {
1508
- readonly repo: string;
1509
- readonly path: string;
1510
- readonly auto_push?: boolean;
1511
- readonly branch_prefix?: string;
1512
- };
1513
- type AgentVConfig$1 = {
1514
- readonly required_version?: string;
1515
- readonly eval_patterns?: readonly string[];
1516
- readonly execution?: ExecutionDefaults;
1517
- readonly results?: {
1518
- readonly export?: ResultsExportConfig;
1519
- };
1520
- };
1521
- /**
1522
- * Load optional .agentv/config.yaml configuration file.
1523
- * Searches from eval file directory up to repo root.
1524
- */
1525
- declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
1526
- /**
1527
- * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
1528
- */
1529
- declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
1530
- /**
1531
- * Extract target refs from parsed eval suite.
1532
- * Supports both string shorthand and object form with hooks.
1533
- * Returns undefined when no targets array is specified.
1534
- */
1535
- declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
1536
- /**
1537
- * Extract target names from parsed eval suite (backward-compat wrapper).
1538
- * Precedence: execution.targets (array) > execution.target (singular).
1539
- * Returns undefined when no targets array is specified.
1540
- */
1541
- declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
1542
- /**
1543
- * Extract workers count from suite-level execution block.
1544
- */
1545
- declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
1546
- /**
1547
- * Extract per-test targets array from a raw test case object.
1548
- */
1549
- declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
1550
- /**
1551
- * Extract trials configuration from parsed eval suite's execution block.
1552
- * Returns undefined when count is 1 or not specified (no-op).
1553
- */
1554
- declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
1536
+ type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
1555
1537
  /**
1556
- * Cache configuration parsed from execution block.
1538
+ * Resolved CLI configuration type derived from CliTargetConfigSchema.
1539
+ * This is the final validated shape used by the CLI provider at runtime.
1540
+ * Using Readonly to ensure immutability for runtime safety.
1557
1541
  */
1558
- interface CacheConfig {
1559
- readonly enabled: boolean;
1560
- readonly cachePath?: string;
1542
+ type CliResolvedConfig = Readonly<CliNormalizedConfig>;
1543
+ interface RetryConfig {
1544
+ readonly maxRetries?: number;
1545
+ readonly initialDelayMs?: number;
1546
+ readonly maxDelayMs?: number;
1547
+ readonly backoffFactor?: number;
1548
+ readonly retryableStatusCodes?: readonly number[];
1561
1549
  }
1562
1550
  /**
1563
- * Extract cache configuration from parsed eval suite's execution block.
1564
- * Returns undefined when no cache config is specified.
1565
- */
1566
- declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1567
- /**
1568
- * Extract `execution.fail_on_error` from parsed eval suite.
1569
- * Accepts `true` or `false`.
1570
- * Returns undefined when not specified.
1551
+ * Selects which OpenAI-compatible API endpoint to use.
1552
+ * - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
1553
+ * - "responses": POST /responses — only supported by api.openai.com.
1554
+ *
1555
+ * Maps to Vercel AI SDK methods: "chat" → provider.chat(model), "responses" → provider(model).
1571
1556
  */
1572
- declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1557
+ type ApiFormat = 'chat' | 'responses';
1573
1558
  /**
1574
- * Extract `execution.threshold` from parsed eval suite.
1575
- * Accepts a number in [0, 1] range.
1576
- * Returns undefined when not specified.
1559
+ * Azure OpenAI settings used by the Vercel AI SDK.
1577
1560
  */
1578
- declare function extractThreshold(suite: JsonObject): number | undefined;
1579
-
1561
+ interface AzureResolvedConfig {
1562
+ readonly resourceName: string;
1563
+ readonly deploymentName: string;
1564
+ readonly apiKey: string;
1565
+ readonly version?: string;
1566
+ readonly apiFormat?: ApiFormat;
1567
+ readonly temperature?: number;
1568
+ readonly maxOutputTokens?: number;
1569
+ readonly retry?: RetryConfig;
1570
+ }
1580
1571
  /**
1581
- * Formatting mode for segment content.
1582
- * - 'agent': File references only (for providers with filesystem access)
1583
- * - 'lm': Embedded file content with XML tags (for language model providers)
1572
+ * OpenAI-compatible settings used by the Vercel AI SDK.
1584
1573
  */
1585
- type FormattingMode = 'agent' | 'lm';
1586
-
1574
+ interface OpenAIResolvedConfig {
1575
+ readonly baseURL: string;
1576
+ readonly apiKey: string;
1577
+ readonly model: string;
1578
+ readonly apiFormat?: ApiFormat;
1579
+ readonly temperature?: number;
1580
+ readonly maxOutputTokens?: number;
1581
+ readonly retry?: RetryConfig;
1582
+ }
1587
1583
  /**
1588
- * Build prompt inputs by consolidating user request context.
1584
+ * OpenRouter settings used by the Vercel AI SDK provider.
1589
1585
  */
1590
- interface PromptInputs {
1591
- readonly question: string;
1592
- readonly chatPrompt?: ChatPrompt;
1593
- readonly systemMessage?: string;
1586
+ interface OpenRouterResolvedConfig {
1587
+ readonly apiKey: string;
1588
+ readonly model: string;
1589
+ readonly temperature?: number;
1590
+ readonly maxOutputTokens?: number;
1591
+ readonly retry?: RetryConfig;
1594
1592
  }
1595
1593
  /**
1596
- * Build prompt inputs by consolidating user request context.
1597
- *
1598
- * @param testCase - The evaluation test case
1599
- * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
1594
+ * Anthropic Claude settings used by the Vercel AI SDK.
1600
1595
  */
1601
- declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
1602
-
1596
+ interface AnthropicResolvedConfig {
1597
+ readonly apiKey: string;
1598
+ readonly model: string;
1599
+ readonly temperature?: number;
1600
+ readonly maxOutputTokens?: number;
1601
+ readonly thinkingBudget?: number;
1602
+ readonly retry?: RetryConfig;
1603
+ }
1603
1604
  /**
1604
- * Detect file format by extension.
1605
+ * Google Gemini settings used by the Vercel AI SDK.
1605
1606
  */
1606
- declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json';
1607
-
1607
+ interface GeminiResolvedConfig {
1608
+ readonly apiKey: string;
1609
+ readonly model: string;
1610
+ readonly temperature?: number;
1611
+ readonly maxOutputTokens?: number;
1612
+ readonly retry?: RetryConfig;
1613
+ }
1614
+ interface CodexResolvedConfig {
1615
+ readonly model?: string;
1616
+ readonly executable: string;
1617
+ readonly args?: readonly string[];
1618
+ readonly cwd?: string;
1619
+ readonly timeoutMs?: number;
1620
+ readonly logDir?: string;
1621
+ readonly logFormat?: 'summary' | 'json';
1622
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1623
+ readonly streamLog?: false | 'raw' | 'summary';
1624
+ readonly systemPrompt?: string;
1625
+ }
1626
+ interface CopilotCliResolvedConfig {
1627
+ readonly executable: string;
1628
+ readonly model?: string;
1629
+ readonly args?: readonly string[];
1630
+ readonly cwd?: string;
1631
+ readonly timeoutMs?: number;
1632
+ readonly logDir?: string;
1633
+ readonly logFormat?: 'summary' | 'json';
1634
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1635
+ readonly streamLog?: false | 'raw' | 'summary';
1636
+ readonly systemPrompt?: string;
1637
+ }
1638
+ interface CopilotSdkResolvedConfig {
1639
+ readonly cliUrl?: string;
1640
+ readonly cliPath?: string;
1641
+ readonly githubToken?: string;
1642
+ readonly model?: string;
1643
+ readonly cwd?: string;
1644
+ readonly timeoutMs?: number;
1645
+ readonly logDir?: string;
1646
+ readonly logFormat?: 'summary' | 'json';
1647
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1648
+ readonly streamLog?: false | 'raw' | 'summary';
1649
+ readonly systemPrompt?: string;
1650
+ /** BYOK provider type: "azure", "openai", or "anthropic". */
1651
+ readonly byokType?: string;
1652
+ /** BYOK base URL for the provider endpoint. */
1653
+ readonly byokBaseUrl?: string;
1654
+ /** BYOK API key for authenticating with the provider. */
1655
+ readonly byokApiKey?: string;
1656
+ /** BYOK bearer token (takes precedence over apiKey when set). */
1657
+ readonly byokBearerToken?: string;
1658
+ /** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
1659
+ readonly byokApiVersion?: string;
1660
+ /** BYOK wire API format: "completions" or "responses". */
1661
+ readonly byokWireApi?: string;
1662
+ }
1663
+ interface CopilotLogResolvedConfig {
1664
+ /** Explicit path to a session directory containing events.jsonl. */
1665
+ readonly sessionDir?: string;
1666
+ /** Session UUID — combined with sessionStateDir to build the path. */
1667
+ readonly sessionId?: string;
1668
+ /** Auto-discovery mode. 'latest' picks the most recent session. */
1669
+ readonly discover?: 'latest';
1670
+ /** Override the default ~/.copilot/session-state directory. */
1671
+ readonly sessionStateDir?: string;
1672
+ /** Filter discovery by working directory. */
1673
+ readonly cwd?: string;
1674
+ }
1675
+ interface PiCodingAgentResolvedConfig {
1676
+ readonly subprovider?: string;
1677
+ readonly model?: string;
1678
+ readonly apiKey?: string;
1679
+ readonly baseUrl?: string;
1680
+ readonly tools?: string;
1681
+ readonly thinking?: string;
1682
+ readonly cwd?: string;
1683
+ readonly timeoutMs?: number;
1684
+ readonly logDir?: string;
1685
+ readonly logFormat?: 'summary' | 'json';
1686
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1687
+ readonly streamLog?: false | 'raw' | 'summary';
1688
+ readonly systemPrompt?: string;
1689
+ }
1690
+ interface PiCliResolvedConfig {
1691
+ readonly executable: string;
1692
+ readonly subprovider?: string;
1693
+ readonly model?: string;
1694
+ readonly apiKey?: string;
1695
+ readonly baseUrl?: string;
1696
+ readonly tools?: string;
1697
+ readonly thinking?: string;
1698
+ readonly args?: readonly string[];
1699
+ readonly cwd?: string;
1700
+ readonly timeoutMs?: number;
1701
+ readonly logDir?: string;
1702
+ readonly logFormat?: 'summary' | 'json';
1703
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1704
+ readonly streamLog?: false | 'raw' | 'summary';
1705
+ readonly systemPrompt?: string;
1706
+ }
1707
+ interface ClaudeResolvedConfig {
1708
+ readonly executable: string;
1709
+ readonly model?: string;
1710
+ readonly systemPrompt?: string;
1711
+ readonly cwd?: string;
1712
+ readonly timeoutMs?: number;
1713
+ readonly maxTurns?: number;
1714
+ readonly maxBudgetUsd?: number;
1715
+ readonly logDir?: string;
1716
+ readonly logFormat?: 'summary' | 'json';
1717
+ /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1718
+ readonly streamLog?: false | 'raw' | 'summary';
1719
+ }
1720
+ interface MockResolvedConfig {
1721
+ readonly response?: string;
1722
+ readonly delayMs?: number;
1723
+ readonly delayMinMs?: number;
1724
+ readonly delayMaxMs?: number;
1725
+ }
1726
+ interface VSCodeResolvedConfig {
1727
+ readonly executable: string;
1728
+ readonly waitForResponse: boolean;
1729
+ readonly dryRun: boolean;
1730
+ readonly subagentRoot?: string;
1731
+ readonly timeoutMs?: number;
1732
+ }
1733
+ interface AgentVResolvedConfig {
1734
+ readonly model: string;
1735
+ readonly temperature: number;
1736
+ }
1737
+ /** Base fields shared by all resolved targets. */
1738
+ interface ResolvedTargetBase {
1739
+ readonly name: string;
1740
+ readonly graderTarget?: string;
1741
+ readonly workers?: number;
1742
+ readonly providerBatching?: boolean;
1743
+ /**
1744
+ * Whether this target can be executed via executor subagents in subagent mode.
1745
+ * Defaults to `true` for all non-CLI providers. Set `false` in targets.yaml
1746
+ * to force CLI invocation even in subagent mode.
1747
+ */
1748
+ readonly subagentModeAllowed?: boolean;
1749
+ /**
1750
+ * Ordered list of target names to try when the primary target fails after
1751
+ * exhausting retries. Each fallback is attempted in order.
1752
+ */
1753
+ readonly fallbackTargets?: readonly string[];
1754
+ }
1755
+ type ResolvedTarget = (ResolvedTargetBase & {
1756
+ readonly kind: 'openai';
1757
+ readonly config: OpenAIResolvedConfig;
1758
+ }) | (ResolvedTargetBase & {
1759
+ readonly kind: 'openrouter';
1760
+ readonly config: OpenRouterResolvedConfig;
1761
+ }) | (ResolvedTargetBase & {
1762
+ readonly kind: 'azure';
1763
+ readonly config: AzureResolvedConfig;
1764
+ }) | (ResolvedTargetBase & {
1765
+ readonly kind: 'anthropic';
1766
+ readonly config: AnthropicResolvedConfig;
1767
+ }) | (ResolvedTargetBase & {
1768
+ readonly kind: 'gemini';
1769
+ readonly config: GeminiResolvedConfig;
1770
+ }) | (ResolvedTargetBase & {
1771
+ readonly kind: 'codex';
1772
+ readonly config: CodexResolvedConfig;
1773
+ }) | (ResolvedTargetBase & {
1774
+ readonly kind: 'copilot-sdk';
1775
+ readonly config: CopilotSdkResolvedConfig;
1776
+ }) | (ResolvedTargetBase & {
1777
+ readonly kind: 'copilot-cli';
1778
+ readonly config: CopilotCliResolvedConfig;
1779
+ }) | (ResolvedTargetBase & {
1780
+ readonly kind: 'copilot-log';
1781
+ readonly config: CopilotLogResolvedConfig;
1782
+ }) | (ResolvedTargetBase & {
1783
+ readonly kind: 'pi-coding-agent';
1784
+ readonly config: PiCodingAgentResolvedConfig;
1785
+ }) | (ResolvedTargetBase & {
1786
+ readonly kind: 'pi-cli';
1787
+ readonly config: PiCliResolvedConfig;
1788
+ }) | (ResolvedTargetBase & {
1789
+ readonly kind: 'claude';
1790
+ readonly config: ClaudeResolvedConfig;
1791
+ }) | (ResolvedTargetBase & {
1792
+ readonly kind: 'claude-cli';
1793
+ readonly config: ClaudeResolvedConfig;
1794
+ }) | (ResolvedTargetBase & {
1795
+ readonly kind: 'claude-sdk';
1796
+ readonly config: ClaudeResolvedConfig;
1797
+ }) | (ResolvedTargetBase & {
1798
+ readonly kind: 'mock';
1799
+ readonly config: MockResolvedConfig;
1800
+ }) | (ResolvedTargetBase & {
1801
+ readonly kind: 'vscode' | 'vscode-insiders';
1802
+ readonly config: VSCodeResolvedConfig;
1803
+ }) | (ResolvedTargetBase & {
1804
+ readonly kind: 'agentv';
1805
+ readonly config: AgentVResolvedConfig;
1806
+ }) | (ResolvedTargetBase & {
1807
+ readonly kind: 'cli';
1808
+ readonly config: CliResolvedConfig;
1809
+ }) | (ResolvedTargetBase & {
1810
+ readonly kind: 'transcript';
1811
+ readonly config: Record<string, never>;
1812
+ });
1813
+ /**
1814
+ * Optional settings accepted on ALL target definitions regardless of provider.
1815
+ * Exported so the targets validator can reuse the same list — adding a field
1816
+ * here automatically makes it valid in targets.yaml without a separate update.
1817
+ */
1818
+ declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
1819
+ declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
1820
+ declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
1821
+ readonly emitDeprecationWarnings?: boolean;
1822
+ }): ResolvedTarget;
1823
+
1824
+ /**
1825
+ * Extensible provider registry.
1826
+ *
1827
+ * Replaces the hardcoded switch/case dispatch in createProvider() with
1828
+ * a registry of named factory functions. Built-in providers are registered
1829
+ * at startup; users can add custom providers via the registry API or by
1830
+ * dropping files in `.agentv/providers/`.
1831
+ */
1832
+
1833
+ /**
1834
+ * Factory function that creates a Provider instance from a resolved target.
1835
+ */
1836
+ type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
1837
+ /**
1838
+ * Registry of provider factory functions keyed by provider kind.
1839
+ *
1840
+ * Built-in providers are registered at startup. Custom providers can be
1841
+ * registered via the `register()` method.
1842
+ */
1843
+ declare class ProviderRegistry {
1844
+ private readonly factories;
1845
+ /** Register a factory function for a provider kind. */
1846
+ register(kind: string, factory: ProviderFactoryFn): this;
1847
+ /** Get the factory function for a provider kind. */
1848
+ get(kind: string): ProviderFactoryFn | undefined;
1849
+ /** Check if a factory is registered for the given kind. */
1850
+ has(kind: string): boolean;
1851
+ /** List all registered provider kind names. */
1852
+ list(): string[];
1853
+ /**
1854
+ * Create a provider instance from a resolved target.
1855
+ * Falls back to CLI provider for unknown kinds (custom provider escape hatch).
1856
+ */
1857
+ create(target: ResolvedTarget): Provider;
1858
+ }
1859
+
1860
+ declare const MetadataSchema: z.ZodObject<{
1861
+ name: z.ZodString;
1862
+ description: z.ZodOptional<z.ZodString>;
1863
+ version: z.ZodOptional<z.ZodString>;
1864
+ author: z.ZodOptional<z.ZodString>;
1865
+ tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1866
+ license: z.ZodOptional<z.ZodString>;
1867
+ requires: z.ZodOptional<z.ZodObject<{
1868
+ agentv: z.ZodOptional<z.ZodString>;
1869
+ }, "strip", z.ZodTypeAny, {
1870
+ agentv?: string | undefined;
1871
+ }, {
1872
+ agentv?: string | undefined;
1873
+ }>>;
1874
+ }, "strip", z.ZodTypeAny, {
1875
+ name: string;
1876
+ description?: string | undefined;
1877
+ version?: string | undefined;
1878
+ author?: string | undefined;
1879
+ tags?: string[] | undefined;
1880
+ license?: string | undefined;
1881
+ requires?: {
1882
+ agentv?: string | undefined;
1883
+ } | undefined;
1884
+ }, {
1885
+ name: string;
1886
+ description?: string | undefined;
1887
+ version?: string | undefined;
1888
+ author?: string | undefined;
1889
+ tags?: string[] | undefined;
1890
+ license?: string | undefined;
1891
+ requires?: {
1892
+ agentv?: string | undefined;
1893
+ } | undefined;
1894
+ }>;
1895
+ type EvalMetadata = z.infer<typeof MetadataSchema>;
1896
+
1897
+ declare const DEFAULT_EVAL_PATTERNS: readonly string[];
1898
+ type ExecutionDefaults = {
1899
+ readonly verbose?: boolean;
1900
+ readonly keep_workspaces?: boolean;
1901
+ readonly otel_file?: string;
1902
+ readonly export_otel?: boolean;
1903
+ readonly otel_backend?: string;
1904
+ readonly otel_capture_content?: boolean;
1905
+ readonly otel_group_turns?: boolean;
1906
+ readonly pool_workspaces?: boolean;
1907
+ readonly pool_slots?: number;
1908
+ };
1909
+ type ResultsExportConfig = {
1910
+ readonly repo: string;
1911
+ readonly path: string;
1912
+ readonly auto_push?: boolean;
1913
+ readonly branch_prefix?: string;
1914
+ };
1915
+ type AgentVConfig$1 = {
1916
+ readonly required_version?: string;
1917
+ readonly eval_patterns?: readonly string[];
1918
+ readonly execution?: ExecutionDefaults;
1919
+ readonly results?: {
1920
+ readonly export?: ResultsExportConfig;
1921
+ };
1922
+ };
1923
+ /**
1924
+ * Load optional .agentv/config.yaml configuration file.
1925
+ * Searches from eval file directory up to repo root.
1926
+ */
1927
+ declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
1928
+ /**
1929
+ * Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
1930
+ */
1931
+ declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
1932
+ /**
1933
+ * Extract target refs from parsed eval suite.
1934
+ * Supports both string shorthand and object form with hooks.
1935
+ * Returns undefined when no targets array is specified.
1936
+ */
1937
+ declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
1938
+ /**
1939
+ * Extract target names from parsed eval suite (backward-compat wrapper).
1940
+ * Precedence: execution.targets (array) > execution.target (singular).
1941
+ * Returns undefined when no targets array is specified.
1942
+ */
1943
+ declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
1944
+ /**
1945
+ * Extract workers count from suite-level execution block.
1946
+ */
1947
+ declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
1948
+ /**
1949
+ * Extract per-test targets array from a raw test case object.
1950
+ */
1951
+ declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
1952
+ /**
1953
+ * Extract trials configuration from parsed eval suite's execution block.
1954
+ * Returns undefined when count is 1 or not specified (no-op).
1955
+ */
1956
+ declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
1957
+ /**
1958
+ * Cache configuration parsed from execution block.
1959
+ */
1960
+ interface CacheConfig {
1961
+ readonly enabled: boolean;
1962
+ readonly cachePath?: string;
1963
+ }
1964
+ /**
1965
+ * Extract cache configuration from parsed eval suite's execution block.
1966
+ * Returns undefined when no cache config is specified.
1967
+ */
1968
+ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1969
+ /**
1970
+ * Extract `execution.fail_on_error` from parsed eval suite.
1971
+ * Accepts `true` or `false`.
1972
+ * Returns undefined when not specified.
1973
+ */
1974
+ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1975
+ /**
1976
+ * Extract `execution.threshold` from parsed eval suite.
1977
+ * Accepts a number in [0, 1] range.
1978
+ * Returns undefined when not specified.
1979
+ */
1980
+ declare function extractThreshold(suite: JsonObject): number | undefined;
1981
+
1982
+ /**
1983
+ * Formatting mode for segment content.
1984
+ * - 'agent': File references only (for providers with filesystem access)
1985
+ * - 'lm': Embedded file content with XML tags (for language model providers)
1986
+ */
1987
+ type FormattingMode = 'agent' | 'lm';
1988
+
1989
+ /**
1990
+ * Build prompt inputs by consolidating user request context.
1991
+ */
1992
+ interface PromptInputs {
1993
+ readonly question: string;
1994
+ readonly chatPrompt?: ChatPrompt;
1995
+ readonly systemMessage?: string;
1996
+ }
1997
+ /**
1998
+ * Build prompt inputs by consolidating user request context.
1999
+ *
2000
+ * @param testCase - The evaluation test case
2001
+ * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
2002
+ */
2003
+ declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
2004
+
2005
+ /**
2006
+ * Detect file format by extension.
2007
+ */
2008
+ declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json' | 'typescript';
2009
+
1608
2010
  type LoadOptions = {
1609
2011
  readonly verbose?: boolean;
1610
2012
  /** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
@@ -1647,6 +2049,10 @@ type EvalSuiteResult = {
1647
2049
  readonly threshold?: number;
1648
2050
  /** Resolved workspace.path from the eval YAML (after env-var expansion), if set */
1649
2051
  readonly workspacePath?: string;
2052
+ /** Inline target definition from a TS eval config. */
2053
+ readonly inlineTarget?: TargetDefinition;
2054
+ /** Custom provider factory from a TS eval config task(). */
2055
+ readonly providerFactory?: ProviderFactoryFn;
1650
2056
  };
1651
2057
  /**
1652
2058
  * Load tests and suite metadata from a single parse.
@@ -1693,495 +2099,370 @@ declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEval
1693
2099
  declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
1694
2100
 
1695
2101
  /**
1696
- * EVAL.yaml evals.json transpiler.
2102
+ * Types for inline assertion functions used in the evaluate() API.
1697
2103
  *
1698
- * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
1699
- * for consumption by the skill-creator pipeline.
2104
+ * Inline functions are the escape hatch for custom evaluation logic
2105
+ * that doesn't fit a built-in grader type. For built-in assertions
2106
+ * (contains, regex, is-json, etc.), use config objects instead:
1700
2107
  *
1701
- * Handles both `assertions:` (current) and `assert:` (deprecated alias).
1702
- */
1703
- interface EvalsJsonCase {
1704
- id: number;
1705
- prompt: string;
1706
- expected_output?: string;
1707
- files?: string[];
1708
- should_trigger?: boolean;
1709
- assertions: string[];
1710
- }
1711
- interface EvalsJsonFile {
1712
- skill_name: string;
1713
- evals: EvalsJsonCase[];
1714
- }
1715
- /**
1716
- * Result of transpiling a single EVAL.yaml.
1717
- * May produce multiple evals.json files (one per skill).
1718
- */
1719
- interface TranspileResult {
1720
- /** Map from skill_name → EvalsJsonFile */
1721
- files: Map<string, EvalsJsonFile>;
1722
- /** Warning messages accumulated during transpilation */
1723
- warnings: string[];
1724
- }
1725
- /**
1726
- * Transpile a parsed EVAL.yaml object into one or more evals.json objects.
2108
+ * assert: [{ type: 'contains', value: 'hello' }]
1727
2109
  *
1728
- * @param suite Parsed YAML object (already loaded, no file I/O here)
1729
- * @param source Source identifier for error messages (e.g. file path)
1730
- */
1731
- declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
1732
- /**
1733
- * Transpile an EVAL.yaml file into one or more evals.json objects.
1734
- * Returns a map from output filename → JSON content.
2110
+ * Inline functions are for custom logic:
1735
2111
  *
1736
- * @param evalYamlPath Absolute path to the EVAL.yaml file
1737
- */
1738
- declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
1739
- /**
1740
- * Determine the output filename(s) for a transpile result.
1741
- * Single skill → "evals.json"
1742
- * Multiple skills → "<skill>.evals.json"
1743
- */
1744
- declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
1745
-
1746
- declare function fileExists(filePath: string): Promise<boolean>;
1747
- /**
1748
- * Normalize line endings to LF (\n).
1749
- * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
1750
- */
1751
- declare function normalizeLineEndings(content: string): string;
1752
- /**
1753
- * Read a text file and normalize line endings to LF (\n).
1754
- * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
1755
- */
1756
- declare function readTextFile(filePath: string): Promise<string>;
1757
- /**
1758
- * Read a JSON file and parse it.
1759
- */
1760
- declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
1761
- /**
1762
- * Find git repository root by walking up the directory tree.
1763
- */
1764
- declare function findGitRoot(startPath: string): Promise<string | null>;
1765
- /**
1766
- * Build a chain of directories walking from a file's location up to repo root.
1767
- * Used for discovering configuration files like targets.yaml or config.yaml.
1768
- */
1769
- declare function buildDirectoryChain(filePath: string, repoRoot: string): readonly string[];
1770
- /**
1771
- * Build search roots for file resolution, matching yaml-parser behavior.
1772
- * Searches from eval file directory up to repo root.
1773
- */
1774
- declare function buildSearchRoots(evalPath: string, repoRoot: string): readonly string[];
1775
- /**
1776
- * Resolve a file reference using search roots, matching yaml-parser behavior.
2112
+ * assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
1777
2113
  */
1778
- declare function resolveFileReference(rawValue: string, searchRoots: readonly string[]): Promise<{
1779
- readonly displayPath: string;
1780
- readonly resolvedPath?: string;
1781
- readonly attempted: readonly string[];
1782
- }>;
2114
+ /** Context passed to inline assertion functions */
2115
+ interface AssertContext {
2116
+ readonly input: string;
2117
+ readonly output: string;
2118
+ readonly expectedOutput?: string;
2119
+ readonly criteria?: string;
2120
+ readonly metadata?: Record<string, unknown>;
2121
+ }
2122
+ /** Result from an inline assertion function */
2123
+ interface AssertResult {
2124
+ readonly name: string;
2125
+ readonly score: number;
2126
+ readonly metadata?: Record<string, unknown>;
2127
+ }
2128
+ /** Inline assertion function signature */
2129
+ type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
1783
2130
 
1784
2131
  /**
1785
- * Strict normalized schema for CLI target configuration.
1786
- * This is the final validated shape after environment variable resolution
1787
- * and internal field normalization.
2132
+ * Programmatic API for running evaluations.
1788
2133
  *
1789
- * Uses .strict() to reject unknown properties, ensuring configuration
1790
- * errors are caught early rather than silently ignored.
2134
+ * Provides `evaluate()` — a high-level function for using AgentV as a library
2135
+ * instead of a CLI. The config shape mirrors the YAML structure for easy
2136
+ * translation between file-based and programmatic usage.
1791
2137
  *
1792
- * @example
2138
+ * @example Inline tests with config objects
1793
2139
  * ```typescript
1794
- * const config: CliNormalizedConfig = {
1795
- * command: 'agent run {PROMPT}',
1796
- * timeoutMs: 120000,
1797
- * verbose: true,
1798
- * };
1799
- * CliTargetConfigSchema.parse(config); // Validates the normalized config
2140
+ * import { evaluate } from '@agentv/core';
2141
+ *
2142
+ * const results = await evaluate({
2143
+ * tests: [
2144
+ * {
2145
+ * id: 'capital',
2146
+ * input: 'What is the capital of France?',
2147
+ * expectedOutput: 'Paris',
2148
+ * assert: [{ type: 'contains', value: 'Paris' }],
2149
+ * },
2150
+ * ],
2151
+ * target: { provider: 'mock_agent' },
2152
+ * });
2153
+ *
2154
+ * console.log(results.summary.passed, 'passed');
2155
+ * ```
2156
+ *
2157
+ * @example Inline tests with task function and custom assertion
2158
+ * ```typescript
2159
+ * import { evaluate } from '@agentv/core';
2160
+ *
2161
+ * const { summary } = await evaluate({
2162
+ * tests: [
2163
+ * {
2164
+ * id: 'echo',
2165
+ * input: 'hello',
2166
+ * expectedOutput: 'Echo: hello',
2167
+ * assert: [
2168
+ * { type: 'contains', value: 'hello' },
2169
+ * { type: 'equals' },
2170
+ * ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
2171
+ * ],
2172
+ * },
2173
+ * ],
2174
+ * task: async (input) => `Echo: ${input}`,
2175
+ * });
1800
2176
  * ```
1801
- */
1802
- declare const CliTargetConfigSchema: z.ZodObject<{
1803
- command: z.ZodString;
1804
- filesFormat: z.ZodOptional<z.ZodString>;
1805
- cwd: z.ZodOptional<z.ZodString>;
1806
- timeoutMs: z.ZodOptional<z.ZodNumber>;
1807
- healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
1808
- url: z.ZodString;
1809
- timeoutMs: z.ZodOptional<z.ZodNumber>;
1810
- }, "strict", z.ZodTypeAny, {
1811
- url: string;
1812
- timeoutMs?: number | undefined;
1813
- }, {
1814
- url: string;
1815
- timeoutMs?: number | undefined;
1816
- }>, z.ZodObject<{
1817
- command: z.ZodString;
1818
- cwd: z.ZodOptional<z.ZodString>;
1819
- timeoutMs: z.ZodOptional<z.ZodNumber>;
1820
- }, "strict", z.ZodTypeAny, {
1821
- command: string;
1822
- timeoutMs?: number | undefined;
1823
- cwd?: string | undefined;
1824
- }, {
1825
- command: string;
1826
- timeoutMs?: number | undefined;
1827
- cwd?: string | undefined;
1828
- }>]>>;
1829
- verbose: z.ZodOptional<z.ZodBoolean>;
1830
- keepTempFiles: z.ZodOptional<z.ZodBoolean>;
1831
- }, "strict", z.ZodTypeAny, {
1832
- command: string;
1833
- timeoutMs?: number | undefined;
1834
- cwd?: string | undefined;
1835
- verbose?: boolean | undefined;
1836
- healthcheck?: {
1837
- url: string;
1838
- timeoutMs?: number | undefined;
1839
- } | {
1840
- command: string;
1841
- timeoutMs?: number | undefined;
1842
- cwd?: string | undefined;
1843
- } | undefined;
1844
- filesFormat?: string | undefined;
1845
- keepTempFiles?: boolean | undefined;
1846
- }, {
1847
- command: string;
1848
- timeoutMs?: number | undefined;
1849
- cwd?: string | undefined;
1850
- verbose?: boolean | undefined;
1851
- healthcheck?: {
1852
- url: string;
1853
- timeoutMs?: number | undefined;
1854
- } | {
1855
- command: string;
1856
- timeoutMs?: number | undefined;
1857
- cwd?: string | undefined;
1858
- } | undefined;
1859
- filesFormat?: string | undefined;
1860
- keepTempFiles?: boolean | undefined;
1861
- }>;
1862
- type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
1863
- /**
1864
- * Resolved CLI configuration type derived from CliTargetConfigSchema.
1865
- * This is the final validated shape used by the CLI provider at runtime.
1866
- * Using Readonly to ensure immutability for runtime safety.
1867
- */
1868
- type CliResolvedConfig = Readonly<CliNormalizedConfig>;
1869
- interface RetryConfig {
1870
- readonly maxRetries?: number;
1871
- readonly initialDelayMs?: number;
1872
- readonly maxDelayMs?: number;
1873
- readonly backoffFactor?: number;
1874
- readonly retryableStatusCodes?: readonly number[];
1875
- }
1876
- /**
1877
- * Selects which OpenAI-compatible API endpoint to use.
1878
- * - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
1879
- * - "responses": POST /responses — only supported by api.openai.com.
1880
2177
  *
1881
- * Maps to Vercel AI SDK methods: "chat" → provider.chat(model), "responses" → provider(model).
2178
+ * @example File-based
2179
+ * ```typescript
2180
+ * const results = await evaluate({
2181
+ * specFile: './evals/EVAL.yaml',
2182
+ * target: { provider: 'claude_agent' },
2183
+ * });
2184
+ * ```
2185
+ *
2186
+ * @module
1882
2187
  */
1883
- type ApiFormat = 'chat' | 'responses';
2188
+
1884
2189
  /**
1885
- * Azure OpenAI settings used by the Vercel AI SDK.
2190
+ * Inline test definition for the programmatic API.
2191
+ * Mirrors the YAML test structure.
1886
2192
  */
1887
- interface AzureResolvedConfig {
1888
- readonly resourceName: string;
1889
- readonly deploymentName: string;
1890
- readonly apiKey: string;
1891
- readonly version?: string;
1892
- readonly apiFormat?: ApiFormat;
1893
- readonly temperature?: number;
1894
- readonly maxOutputTokens?: number;
1895
- readonly retry?: RetryConfig;
2193
+ interface EvalTestInput {
2194
+ /** Unique test identifier */
2195
+ readonly id: string;
2196
+ /** What the response should accomplish */
2197
+ readonly criteria?: string;
2198
+ /** Input to the agent (string or message array). Omit when using turns[]. */
2199
+ readonly input?: string | readonly {
2200
+ role: string;
2201
+ content: string;
2202
+ }[];
2203
+ /** Expected reference output (camelCase preferred) */
2204
+ readonly expectedOutput?: string;
2205
+ /** @deprecated Use `expectedOutput` instead */
2206
+ readonly expected_output?: string;
2207
+ /** Assertion graders — accepts factory functions, config objects, or inline functions */
2208
+ readonly assert?: readonly AssertEntry[];
2209
+ /** Arbitrary metadata */
2210
+ readonly metadata?: Record<string, unknown>;
2211
+ /** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */
2212
+ readonly mode?: 'conversation';
2213
+ /** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
2214
+ readonly turns?: readonly ConversationTurnInput[];
2215
+ /** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */
2216
+ readonly aggregation?: ConversationAggregation;
1896
2217
  }
1897
2218
  /**
1898
- * OpenAI-compatible settings used by the Vercel AI SDK.
2219
+ * A single turn in a multi-turn conversation evaluation (programmatic API).
2220
+ * Mirrors the YAML `turns` structure with camelCase naming.
1899
2221
  */
1900
- interface OpenAIResolvedConfig {
1901
- readonly baseURL: string;
1902
- readonly apiKey: string;
1903
- readonly model: string;
1904
- readonly apiFormat?: ApiFormat;
1905
- readonly temperature?: number;
1906
- readonly maxOutputTokens?: number;
1907
- readonly retry?: RetryConfig;
2222
+ interface ConversationTurnInput {
2223
+ /** Input for this turn (string or message array) */
2224
+ readonly input: string | readonly {
2225
+ role: string;
2226
+ content: string;
2227
+ }[];
2228
+ /** Expected reference output for this turn */
2229
+ readonly expectedOutput?: string;
2230
+ /** @deprecated Use `expectedOutput` instead */
2231
+ readonly expected_output?: string;
2232
+ /** Per-turn assertions (inline function or grader config object) */
2233
+ readonly assert?: readonly AssertEntry[];
1908
2234
  }
1909
2235
  /**
1910
- * OpenRouter settings used by the Vercel AI SDK provider.
2236
+ * Inline assertion definition for the programmatic API.
2237
+ * Matches the YAML `assert` block structure.
1911
2238
  */
1912
- interface OpenRouterResolvedConfig {
1913
- readonly apiKey: string;
1914
- readonly model: string;
1915
- readonly temperature?: number;
1916
- readonly maxOutputTokens?: number;
1917
- readonly retry?: RetryConfig;
2239
+ interface EvalAssertionInput {
2240
+ /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
2241
+ readonly type: string;
2242
+ /** Display name */
2243
+ readonly name?: string;
2244
+ /** Value for deterministic assertions (contains, equals, regex) */
2245
+ readonly value?: string;
2246
+ /** Weight for scoring */
2247
+ readonly weight?: number;
2248
+ /** Whether this assertion is required to pass */
2249
+ readonly required?: boolean | number;
2250
+ /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
2251
+ readonly min_score?: number;
2252
+ /** Prompt file for llm_grader */
2253
+ readonly prompt?: string;
2254
+ /** Script for code_grader */
2255
+ readonly script?: string | readonly string[];
2256
+ /** Additional config passed to the assertion */
2257
+ readonly config?: Record<string, unknown>;
2258
+ /** Nested assertions for composite type */
2259
+ readonly assert?: readonly EvalAssertionInput[];
2260
+ /** Rubric criteria for rubrics type */
2261
+ readonly criteria?: readonly (string | {
2262
+ id?: string;
2263
+ outcome: string;
2264
+ weight?: number;
2265
+ })[];
2266
+ /** Additional properties */
2267
+ readonly [key: string]: unknown;
1918
2268
  }
2269
+ /** Assert entry: inline function or config object */
2270
+ type AssertEntry = AssertFn | EvalAssertionInput;
1919
2271
  /**
1920
- * Anthropic Claude settings used by the Vercel AI SDK.
2272
+ * Configuration for `evaluate()`.
2273
+ * Accepts either inline tests or a spec file path.
1921
2274
  */
1922
- interface AnthropicResolvedConfig {
1923
- readonly apiKey: string;
1924
- readonly model: string;
1925
- readonly temperature?: number;
1926
- readonly maxOutputTokens?: number;
1927
- readonly thinkingBudget?: number;
1928
- readonly retry?: RetryConfig;
2275
+ interface EvalConfig {
2276
+ /** Inline test definitions (mutually exclusive with specFile) */
2277
+ readonly tests?: readonly EvalTestInput[];
2278
+ /** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
2279
+ readonly specFile?: string;
2280
+ /** Target provider configuration */
2281
+ readonly target?: TargetDefinition;
2282
+ /** Custom task function — mutually exclusive with target */
2283
+ readonly task?: (input: string) => string | Promise<string>;
2284
+ /** Suite-level assertions applied to all tests */
2285
+ readonly assert?: readonly AssertEntry[];
2286
+ /** Optional suite metadata used by CLI discovery, tagging, and reporting. */
2287
+ readonly metadata?: EvalMetadata;
2288
+ /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
2289
+ readonly filter?: string | readonly string[];
2290
+ /** Maximum concurrent workers (default: 3) */
2291
+ readonly workers?: number;
2292
+ /** Maximum retries on failure (default: 2) */
2293
+ readonly maxRetries?: number;
2294
+ /** Agent timeout in milliseconds. No timeout if not set. */
2295
+ readonly agentTimeoutMs?: number;
2296
+ /** Enable response caching */
2297
+ readonly cache?: boolean;
2298
+ /** Verbose logging */
2299
+ readonly verbose?: boolean;
2300
+ /** Callback for each completed result */
2301
+ readonly onResult?: (result: EvaluationResult) => void;
2302
+ /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
2303
+ readonly threshold?: number;
2304
+ /** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */
2305
+ readonly beforeAll?: string | readonly string[];
2306
+ /** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */
2307
+ readonly budgetUsd?: number;
1929
2308
  }
1930
2309
  /**
1931
- * Google Gemini settings used by the Vercel AI SDK.
2310
+ * Summary statistics for an evaluation run.
1932
2311
  */
1933
- interface GeminiResolvedConfig {
1934
- readonly apiKey: string;
1935
- readonly model: string;
1936
- readonly temperature?: number;
1937
- readonly maxOutputTokens?: number;
1938
- readonly retry?: RetryConfig;
1939
- }
1940
- interface CodexResolvedConfig {
1941
- readonly model?: string;
1942
- readonly executable: string;
1943
- readonly args?: readonly string[];
1944
- readonly cwd?: string;
1945
- readonly timeoutMs?: number;
1946
- readonly logDir?: string;
1947
- readonly logFormat?: 'summary' | 'json';
1948
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1949
- readonly streamLog?: false | 'raw' | 'summary';
1950
- readonly systemPrompt?: string;
1951
- }
1952
- interface CopilotCliResolvedConfig {
1953
- readonly executable: string;
1954
- readonly model?: string;
1955
- readonly args?: readonly string[];
1956
- readonly cwd?: string;
1957
- readonly timeoutMs?: number;
1958
- readonly logDir?: string;
1959
- readonly logFormat?: 'summary' | 'json';
1960
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1961
- readonly streamLog?: false | 'raw' | 'summary';
1962
- readonly systemPrompt?: string;
1963
- }
1964
- interface CopilotSdkResolvedConfig {
1965
- readonly cliUrl?: string;
1966
- readonly cliPath?: string;
1967
- readonly githubToken?: string;
1968
- readonly model?: string;
1969
- readonly cwd?: string;
1970
- readonly timeoutMs?: number;
1971
- readonly logDir?: string;
1972
- readonly logFormat?: 'summary' | 'json';
1973
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
1974
- readonly streamLog?: false | 'raw' | 'summary';
1975
- readonly systemPrompt?: string;
1976
- /** BYOK provider type: "azure", "openai", or "anthropic". */
1977
- readonly byokType?: string;
1978
- /** BYOK base URL for the provider endpoint. */
1979
- readonly byokBaseUrl?: string;
1980
- /** BYOK API key for authenticating with the provider. */
1981
- readonly byokApiKey?: string;
1982
- /** BYOK bearer token (takes precedence over apiKey when set). */
1983
- readonly byokBearerToken?: string;
1984
- /** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
1985
- readonly byokApiVersion?: string;
1986
- /** BYOK wire API format: "completions" or "responses". */
1987
- readonly byokWireApi?: string;
1988
- }
1989
- interface CopilotLogResolvedConfig {
1990
- /** Explicit path to a session directory containing events.jsonl. */
1991
- readonly sessionDir?: string;
1992
- /** Session UUID — combined with sessionStateDir to build the path. */
1993
- readonly sessionId?: string;
1994
- /** Auto-discovery mode. 'latest' picks the most recent session. */
1995
- readonly discover?: 'latest';
1996
- /** Override the default ~/.copilot/session-state directory. */
1997
- readonly sessionStateDir?: string;
1998
- /** Filter discovery by working directory. */
1999
- readonly cwd?: string;
2000
- }
2001
- interface PiCodingAgentResolvedConfig {
2002
- readonly subprovider?: string;
2003
- readonly model?: string;
2004
- readonly apiKey?: string;
2005
- readonly baseUrl?: string;
2006
- readonly tools?: string;
2007
- readonly thinking?: string;
2008
- readonly cwd?: string;
2009
- readonly timeoutMs?: number;
2010
- readonly logDir?: string;
2011
- readonly logFormat?: 'summary' | 'json';
2012
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
2013
- readonly streamLog?: false | 'raw' | 'summary';
2014
- readonly systemPrompt?: string;
2015
- }
2016
- interface PiCliResolvedConfig {
2017
- readonly executable: string;
2018
- readonly subprovider?: string;
2019
- readonly model?: string;
2020
- readonly apiKey?: string;
2021
- readonly baseUrl?: string;
2022
- readonly tools?: string;
2023
- readonly thinking?: string;
2024
- readonly args?: readonly string[];
2025
- readonly cwd?: string;
2026
- readonly timeoutMs?: number;
2027
- readonly logDir?: string;
2028
- readonly logFormat?: 'summary' | 'json';
2029
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
2030
- readonly streamLog?: false | 'raw' | 'summary';
2031
- readonly systemPrompt?: string;
2312
+ interface EvalSummary {
2313
+ /** Total number of test cases */
2314
+ readonly total: number;
2315
+ /** Number of passing test cases (score >= threshold) */
2316
+ readonly passed: number;
2317
+ /** Number of failing test cases (score < threshold) */
2318
+ readonly failed: number;
2319
+ /** Total duration in milliseconds */
2320
+ readonly durationMs: number;
2321
+ /** Mean score across all cases */
2322
+ readonly meanScore: number;
2032
2323
  }
2033
- interface ClaudeResolvedConfig {
2034
- readonly executable: string;
2035
- readonly model?: string;
2036
- readonly systemPrompt?: string;
2037
- readonly cwd?: string;
2038
- readonly timeoutMs?: number;
2039
- readonly maxTurns?: number;
2040
- readonly maxBudgetUsd?: number;
2041
- readonly logDir?: string;
2042
- readonly logFormat?: 'summary' | 'json';
2043
- /** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
2044
- readonly streamLog?: false | 'raw' | 'summary';
2324
+ /**
2325
+ * Result of an `evaluate()` call.
2326
+ */
2327
+ interface EvalRunResult {
2328
+ /** Individual test case results */
2329
+ readonly results: readonly EvaluationResult[];
2330
+ /** Aggregate summary statistics */
2331
+ readonly summary: EvalSummary;
2045
2332
  }
2046
- interface MockResolvedConfig {
2047
- readonly response?: string;
2048
- readonly delayMs?: number;
2049
- readonly delayMinMs?: number;
2050
- readonly delayMaxMs?: number;
2333
+ /**
2334
+ * Run an evaluation suite against a target provider.
2335
+ *
2336
+ * Accepts either inline test definitions or a path to an EVAL.yaml spec file.
2337
+ * The config shape mirrors the YAML structure — users can translate between
2338
+ * file-based and programmatic usage 1:1.
2339
+ *
2340
+ * @param config - Evaluation configuration
2341
+ * @returns Typed evaluation results with summary statistics
2342
+ *
2343
+ * @example Inline tests with assertions
2344
+ * ```typescript
2345
+ * const { results, summary } = await evaluate({
2346
+ * tests: [
2347
+ * {
2348
+ * id: 'greeting',
2349
+ * input: 'Say hello',
2350
+ * assert: [{ type: 'contains', value: 'hello' }],
2351
+ * },
2352
+ * ],
2353
+ * target: { provider: 'mock_agent' },
2354
+ * });
2355
+ * console.log(`${summary.passed}/${summary.total} passed`);
2356
+ * ```
2357
+ *
2358
+ * @example Load from YAML
2359
+ * ```typescript
2360
+ * const { summary } = await evaluate({
2361
+ * specFile: './evals/my-eval.yaml',
2362
+ * filter: 'greeting-*',
2363
+ * });
2364
+ * ```
2365
+ */
2366
+ declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
2367
+
2368
+ interface TsEvalResult {
2369
+ readonly config: EvalConfig;
2370
+ readonly filePath: string;
2051
2371
  }
2052
- interface VSCodeResolvedConfig {
2053
- readonly executable: string;
2054
- readonly waitForResponse: boolean;
2055
- readonly dryRun: boolean;
2056
- readonly subagentRoot?: string;
2057
- readonly timeoutMs?: number;
2372
+ /**
2373
+ * Import a *.eval.ts file and extract the EvalConfig export.
2374
+ * Tries default, `config`, and `evalConfig` named exports in priority order.
2375
+ */
2376
+ declare function loadTsEvalFile(filePath: string): Promise<TsEvalResult>;
2377
+
2378
+ /**
2379
+ * EVAL.yaml → evals.json transpiler.
2380
+ *
2381
+ * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
2382
+ * for consumption by the skill-creator pipeline.
2383
+ *
2384
+ * Handles both `assertions:` (current) and `assert:` (deprecated alias).
2385
+ */
2386
+ interface EvalsJsonCase {
2387
+ id: number;
2388
+ prompt: string;
2389
+ expected_output?: string;
2390
+ files?: string[];
2391
+ should_trigger?: boolean;
2392
+ assertions: string[];
2058
2393
  }
2059
- interface AgentVResolvedConfig {
2060
- readonly model: string;
2061
- readonly temperature: number;
2394
+ interface EvalsJsonFile {
2395
+ skill_name: string;
2396
+ evals: EvalsJsonCase[];
2062
2397
  }
2063
- /** Base fields shared by all resolved targets. */
2064
- interface ResolvedTargetBase {
2065
- readonly name: string;
2066
- readonly graderTarget?: string;
2067
- readonly workers?: number;
2068
- readonly providerBatching?: boolean;
2069
- /**
2070
- * Whether this target can be executed via executor subagents in subagent mode.
2071
- * Defaults to `true` for all non-CLI providers. Set `false` in targets.yaml
2072
- * to force CLI invocation even in subagent mode.
2073
- */
2074
- readonly subagentModeAllowed?: boolean;
2075
- /**
2076
- * Ordered list of target names to try when the primary target fails after
2077
- * exhausting retries. Each fallback is attempted in order.
2078
- */
2079
- readonly fallbackTargets?: readonly string[];
2398
+ /**
2399
+ * Result of transpiling a single EVAL.yaml.
2400
+ * May produce multiple evals.json files (one per skill).
2401
+ */
2402
+ interface TranspileResult {
2403
+ /** Map from skill_name → EvalsJsonFile */
2404
+ files: Map<string, EvalsJsonFile>;
2405
+ /** Warning messages accumulated during transpilation */
2406
+ warnings: string[];
2080
2407
  }
2081
- type ResolvedTarget = (ResolvedTargetBase & {
2082
- readonly kind: 'openai';
2083
- readonly config: OpenAIResolvedConfig;
2084
- }) | (ResolvedTargetBase & {
2085
- readonly kind: 'openrouter';
2086
- readonly config: OpenRouterResolvedConfig;
2087
- }) | (ResolvedTargetBase & {
2088
- readonly kind: 'azure';
2089
- readonly config: AzureResolvedConfig;
2090
- }) | (ResolvedTargetBase & {
2091
- readonly kind: 'anthropic';
2092
- readonly config: AnthropicResolvedConfig;
2093
- }) | (ResolvedTargetBase & {
2094
- readonly kind: 'gemini';
2095
- readonly config: GeminiResolvedConfig;
2096
- }) | (ResolvedTargetBase & {
2097
- readonly kind: 'codex';
2098
- readonly config: CodexResolvedConfig;
2099
- }) | (ResolvedTargetBase & {
2100
- readonly kind: 'copilot-sdk';
2101
- readonly config: CopilotSdkResolvedConfig;
2102
- }) | (ResolvedTargetBase & {
2103
- readonly kind: 'copilot-cli';
2104
- readonly config: CopilotCliResolvedConfig;
2105
- }) | (ResolvedTargetBase & {
2106
- readonly kind: 'copilot-log';
2107
- readonly config: CopilotLogResolvedConfig;
2108
- }) | (ResolvedTargetBase & {
2109
- readonly kind: 'pi-coding-agent';
2110
- readonly config: PiCodingAgentResolvedConfig;
2111
- }) | (ResolvedTargetBase & {
2112
- readonly kind: 'pi-cli';
2113
- readonly config: PiCliResolvedConfig;
2114
- }) | (ResolvedTargetBase & {
2115
- readonly kind: 'claude';
2116
- readonly config: ClaudeResolvedConfig;
2117
- }) | (ResolvedTargetBase & {
2118
- readonly kind: 'claude-cli';
2119
- readonly config: ClaudeResolvedConfig;
2120
- }) | (ResolvedTargetBase & {
2121
- readonly kind: 'claude-sdk';
2122
- readonly config: ClaudeResolvedConfig;
2123
- }) | (ResolvedTargetBase & {
2124
- readonly kind: 'mock';
2125
- readonly config: MockResolvedConfig;
2126
- }) | (ResolvedTargetBase & {
2127
- readonly kind: 'vscode' | 'vscode-insiders';
2128
- readonly config: VSCodeResolvedConfig;
2129
- }) | (ResolvedTargetBase & {
2130
- readonly kind: 'agentv';
2131
- readonly config: AgentVResolvedConfig;
2132
- }) | (ResolvedTargetBase & {
2133
- readonly kind: 'cli';
2134
- readonly config: CliResolvedConfig;
2135
- }) | (ResolvedTargetBase & {
2136
- readonly kind: 'transcript';
2137
- readonly config: Record<string, never>;
2138
- });
2139
2408
  /**
2140
- * Optional settings accepted on ALL target definitions regardless of provider.
2141
- * Exported so the targets validator can reuse the same list — adding a field
2142
- * here automatically makes it valid in targets.yaml without a separate update.
2409
+ * Transpile a parsed EVAL.yaml object into one or more evals.json objects.
2410
+ *
2411
+ * @param suite Parsed YAML object (already loaded, no file I/O here)
2412
+ * @param source Source identifier for error messages (e.g. file path)
2143
2413
  */
2144
- declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
2145
- declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
2146
- declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
2147
- readonly emitDeprecationWarnings?: boolean;
2148
- }): ResolvedTarget;
2149
-
2414
+ declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
2150
2415
  /**
2151
- * Extensible provider registry.
2416
+ * Transpile an EVAL.yaml file into one or more evals.json objects.
2417
+ * Returns a `TranspileResult` whose `files` map is keyed by skill name; use `getOutputFilenames` to derive the output filenames.
2152
2418
  *
2153
- * Replaces the hardcoded switch/case dispatch in createProvider() with
2154
- * a registry of named factory functions. Built-in providers are registered
2155
- * at startup; users can add custom providers via the registry API or by
2156
- * dropping files in `.agentv/providers/`.
2419
+ * @param evalYamlPath Absolute path to the EVAL.yaml file
2420
+ */
2421
+ declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
2422
+ /**
2423
+ * Determine the output filename(s) for a transpile result.
2424
+ * Single skill → "evals.json"
2425
+ * Multiple skills → "<skill>.evals.json"
2157
2426
  */
2427
+ declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
2158
2428
 
2429
+ declare function fileExists(filePath: string): Promise<boolean>;
2159
2430
  /**
2160
- * Factory function that creates a Provider instance from a resolved target.
2431
+ * Normalize line endings to LF (\n).
2432
+ * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
2161
2433
  */
2162
- type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
2434
+ declare function normalizeLineEndings(content: string): string;
2163
2435
  /**
2164
- * Registry of provider factory functions keyed by provider kind.
2165
- *
2166
- * Built-in providers are registered at startup. Custom providers can be
2167
- * registered via the `register()` method.
2436
+ * Read a text file and normalize line endings to LF (\n).
2437
+ * This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
2168
2438
  */
2169
- declare class ProviderRegistry {
2170
- private readonly factories;
2171
- /** Register a factory function for a provider kind. */
2172
- register(kind: string, factory: ProviderFactoryFn): this;
2173
- /** Get the factory function for a provider kind. */
2174
- get(kind: string): ProviderFactoryFn | undefined;
2175
- /** Check if a factory is registered for the given kind. */
2176
- has(kind: string): boolean;
2177
- /** List all registered provider kind names. */
2178
- list(): string[];
2179
- /**
2180
- * Create a provider instance from a resolved target.
2181
- * Falls back to CLI provider for unknown kinds (custom provider escape hatch).
2182
- */
2183
- create(target: ResolvedTarget): Provider;
2184
- }
2439
+ declare function readTextFile(filePath: string): Promise<string>;
2440
+ /**
2441
+ * Read a JSON file and parse it.
2442
+ */
2443
+ declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
2444
+ /**
2445
+ * Find git repository root by walking up the directory tree.
2446
+ */
2447
+ declare function findGitRoot(startPath: string): Promise<string | null>;
2448
+ /**
2449
+ * Build a chain of directories walking from a file's location up to repo root.
2450
+ * Used for discovering configuration files like targets.yaml or config.yaml.
2451
+ */
2452
+ declare function buildDirectoryChain(filePath: string, repoRoot: string): readonly string[];
2453
+ /**
2454
+ * Build search roots for file resolution, matching yaml-parser behavior.
2455
+ * Searches from eval file directory up to repo root.
2456
+ */
2457
+ declare function buildSearchRoots(evalPath: string, repoRoot: string): readonly string[];
2458
+ /**
2459
+ * Resolve a file reference using search roots, matching yaml-parser behavior.
2460
+ */
2461
+ declare function resolveFileReference(rawValue: string, searchRoots: readonly string[]): Promise<{
2462
+ readonly displayPath: string;
2463
+ readonly resolvedPath?: string;
2464
+ readonly attempted: readonly string[];
2465
+ }>;
2185
2466
 
2186
2467
  declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
2187
2468
  declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
@@ -2631,26 +2912,26 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2631
2912
  reasoning: z.ZodString;
2632
2913
  }, "strip", z.ZodTypeAny, {
2633
2914
  id: string;
2634
- reasoning: string;
2635
2915
  satisfied: boolean;
2916
+ reasoning: string;
2636
2917
  }, {
2637
2918
  id: string;
2638
- reasoning: string;
2639
2919
  satisfied: boolean;
2920
+ reasoning: string;
2640
2921
  }>, "many">;
2641
2922
  overall_reasoning: z.ZodString;
2642
2923
  }, "strip", z.ZodTypeAny, {
2643
2924
  checks: {
2644
2925
  id: string;
2645
- reasoning: string;
2646
2926
  satisfied: boolean;
2927
+ reasoning: string;
2647
2928
  }[];
2648
2929
  overall_reasoning: string;
2649
2930
  }, {
2650
2931
  checks: {
2651
2932
  id: string;
2652
- reasoning: string;
2653
2933
  satisfied: boolean;
2934
+ reasoning: string;
2654
2935
  }[];
2655
2936
  overall_reasoning: string;
2656
2937
  }>;
@@ -3105,244 +3386,6 @@ interface RunEvaluationOptions {
3105
3386
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
3106
3387
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
3107
3388
 
3108
- /**
3109
- * Types for inline assertion functions used in the evaluate() API.
3110
- *
3111
- * Inline functions are the escape hatch for custom evaluation logic
3112
- * that doesn't fit a built-in grader type. For built-in assertions
3113
- * (contains, regex, is-json, etc.), use config objects instead:
3114
- *
3115
- * assert: [{ type: 'contains', value: 'hello' }]
3116
- *
3117
- * Inline functions are for custom logic:
3118
- *
3119
- * assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
3120
- */
3121
- /** Context passed to inline assertion functions */
3122
- interface AssertContext {
3123
- readonly input: string;
3124
- readonly output: string;
3125
- readonly expectedOutput?: string;
3126
- readonly criteria?: string;
3127
- readonly metadata?: Record<string, unknown>;
3128
- }
3129
- /** Result from an inline assertion function */
3130
- interface AssertResult {
3131
- readonly name: string;
3132
- readonly score: number;
3133
- readonly metadata?: Record<string, unknown>;
3134
- }
3135
- /** Inline assertion function signature */
3136
- type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
3137
-
3138
- /**
3139
- * Programmatic API for running evaluations.
3140
- *
3141
- * Provides `evaluate()` — a high-level function for using AgentV as a library
3142
- * instead of a CLI. The config shape mirrors the YAML structure for easy
3143
- * translation between file-based and programmatic usage.
3144
- *
3145
- * @example Inline tests with config objects
3146
- * ```typescript
3147
- * import { evaluate } from '@agentv/core';
3148
- *
3149
- * const results = await evaluate({
3150
- * tests: [
3151
- * {
3152
- * id: 'capital',
3153
- * input: 'What is the capital of France?',
3154
- * expectedOutput: 'Paris',
3155
- * assert: [{ type: 'contains', value: 'Paris' }],
3156
- * },
3157
- * ],
3158
- * target: { provider: 'mock_agent' },
3159
- * });
3160
- *
3161
- * console.log(results.summary.passed, 'passed');
3162
- * ```
3163
- *
3164
- * @example Inline tests with task function and custom assertion
3165
- * ```typescript
3166
- * import { evaluate } from '@agentv/core';
3167
- *
3168
- * const { summary } = await evaluate({
3169
- * tests: [
3170
- * {
3171
- * id: 'echo',
3172
- * input: 'hello',
3173
- * expectedOutput: 'Echo: hello',
3174
- * assert: [
3175
- * { type: 'contains', value: 'hello' },
3176
- * { type: 'equals' },
3177
- * ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
3178
- * ],
3179
- * },
3180
- * ],
3181
- * task: async (input) => `Echo: ${input}`,
3182
- * });
3183
- * ```
3184
- *
3185
- * @example File-based
3186
- * ```typescript
3187
- * const results = await evaluate({
3188
- * specFile: './evals/EVAL.yaml',
3189
- * target: { provider: 'claude_agent' },
3190
- * });
3191
- * ```
3192
- *
3193
- * @module
3194
- */
3195
-
3196
- /**
3197
- * Inline test definition for the programmatic API.
3198
- * Mirrors the YAML test structure.
3199
- */
3200
- interface EvalTestInput {
3201
- /** Unique test identifier */
3202
- readonly id: string;
3203
- /** What the response should accomplish */
3204
- readonly criteria?: string;
3205
- /** Input to the agent (string or message array) */
3206
- readonly input: string | readonly {
3207
- role: string;
3208
- content: string;
3209
- }[];
3210
- /** Expected reference output (camelCase preferred) */
3211
- readonly expectedOutput?: string;
3212
- /** @deprecated Use `expectedOutput` instead */
3213
- readonly expected_output?: string;
3214
- /** Assertion graders — accepts factory functions, config objects, or inline functions */
3215
- readonly assert?: readonly AssertEntry[];
3216
- /** Arbitrary metadata */
3217
- readonly metadata?: Record<string, unknown>;
3218
- }
3219
- /**
3220
- * Inline assertion definition for the programmatic API.
3221
- * Matches the YAML `assert` block structure.
3222
- */
3223
- interface EvalAssertionInput {
3224
- /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
3225
- readonly type: string;
3226
- /** Display name */
3227
- readonly name?: string;
3228
- /** Value for deterministic assertions (contains, equals, regex) */
3229
- readonly value?: string;
3230
- /** Weight for scoring */
3231
- readonly weight?: number;
3232
- /** Whether this assertion is required to pass */
3233
- readonly required?: boolean | number;
3234
- /** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
3235
- readonly min_score?: number;
3236
- /** Prompt file for llm_grader */
3237
- readonly prompt?: string;
3238
- /** Script for code_grader */
3239
- readonly script?: string | readonly string[];
3240
- /** Additional config passed to the assertion */
3241
- readonly config?: Record<string, unknown>;
3242
- /** Nested assertions for composite type */
3243
- readonly assert?: readonly EvalAssertionInput[];
3244
- /** Rubric criteria for rubrics type */
3245
- readonly criteria?: readonly (string | {
3246
- id?: string;
3247
- outcome: string;
3248
- weight?: number;
3249
- })[];
3250
- /** Additional properties */
3251
- readonly [key: string]: unknown;
3252
- }
3253
- /** Assert entry: inline function or config object */
3254
- type AssertEntry = AssertFn | EvalAssertionInput;
3255
- /**
3256
- * Configuration for `evaluate()`.
3257
- * Accepts either inline tests or a spec file path.
3258
- */
3259
- interface EvalConfig {
3260
- /** Inline test definitions (mutually exclusive with specFile) */
3261
- readonly tests?: readonly EvalTestInput[];
3262
- /** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
3263
- readonly specFile?: string;
3264
- /** Target provider configuration */
3265
- readonly target?: TargetDefinition;
3266
- /** Custom task function — mutually exclusive with target */
3267
- readonly task?: (input: string) => string | Promise<string>;
3268
- /** Suite-level assertions applied to all tests */
3269
- readonly assert?: readonly AssertEntry[];
3270
- /** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
3271
- readonly filter?: string | readonly string[];
3272
- /** Maximum concurrent workers (default: 3) */
3273
- readonly workers?: number;
3274
- /** Maximum retries on failure (default: 2) */
3275
- readonly maxRetries?: number;
3276
- /** Agent timeout in milliseconds. No timeout if not set. */
3277
- readonly agentTimeoutMs?: number;
3278
- /** Enable response caching */
3279
- readonly cache?: boolean;
3280
- /** Verbose logging */
3281
- readonly verbose?: boolean;
3282
- /** Callback for each completed result */
3283
- readonly onResult?: (result: EvaluationResult) => void;
3284
- /** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
3285
- readonly threshold?: number;
3286
- }
3287
- /**
3288
- * Summary statistics for an evaluation run.
3289
- */
3290
- interface EvalSummary {
3291
- /** Total number of test cases */
3292
- readonly total: number;
3293
- /** Number of passing test cases (score >= threshold) */
3294
- readonly passed: number;
3295
- /** Number of failing test cases (score < threshold) */
3296
- readonly failed: number;
3297
- /** Total duration in milliseconds */
3298
- readonly durationMs: number;
3299
- /** Mean score across all cases */
3300
- readonly meanScore: number;
3301
- }
3302
- /**
3303
- * Result of an `evaluate()` call.
3304
- */
3305
- interface EvalRunResult {
3306
- /** Individual test case results */
3307
- readonly results: readonly EvaluationResult[];
3308
- /** Aggregate summary statistics */
3309
- readonly summary: EvalSummary;
3310
- }
3311
- /**
3312
- * Run an evaluation suite against a target provider.
3313
- *
3314
- * Accepts either inline test definitions or a path to an EVAL.yaml spec file.
3315
- * The config shape mirrors the YAML structure — users can translate between
3316
- * file-based and programmatic usage 1:1.
3317
- *
3318
- * @param config - Evaluation configuration
3319
- * @returns Typed evaluation results with summary statistics
3320
- *
3321
- * @example Inline tests with assertions
3322
- * ```typescript
3323
- * const { results, summary } = await evaluate({
3324
- * tests: [
3325
- * {
3326
- * id: 'greeting',
3327
- * input: 'Say hello',
3328
- * assert: [{ type: 'contains', value: 'hello' }],
3329
- * },
3330
- * ],
3331
- * target: { provider: 'mock_agent' },
3332
- * });
3333
- * console.log(`${summary.passed}/${summary.total} passed`);
3334
- * ```
3335
- *
3336
- * @example Load from YAML
3337
- * ```typescript
3338
- * const { summary } = await evaluate({
3339
- * specFile: './evals/my-eval.yaml',
3340
- * filter: 'greeting-*',
3341
- * });
3342
- * ```
3343
- */
3344
- declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
3345
-
3346
3389
  /**
3347
3390
  * Typed configuration file support for AgentV.
3348
3391
  *
@@ -4553,4 +4596,4 @@ type AgentKernel = {
4553
4596
  };
4554
4597
  declare function createAgentKernel(): AgentKernel;
4555
4598
 
4556
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, 
type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type 
RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, 
createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, 
runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4599
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type 
EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type 
RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, 
createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, 
rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };