@agentv/core 4.18.0-next.1 → 4.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-PYDBJOAO.js → chunk-24ND5HZC.js} +97 -97
- package/dist/chunk-24ND5HZC.js.map +1 -0
- package/dist/chunk-QXX3IBYV.js +19740 -0
- package/dist/chunk-QXX3IBYV.js.map +1 -0
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +20086 -19073
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +861 -818
- package/dist/index.d.ts +861 -818
- package/dist/index.js +479 -19769
- package/dist/index.js.map +1 -1
- package/dist/ts-eval-loader-XFQ6S4DT.js +12 -0
- package/dist/ts-eval-loader-XFQ6S4DT.js.map +1 -0
- package/package.json +1 -1
- package/dist/chunk-PYDBJOAO.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1455,156 +1455,558 @@ interface GraderResult {
|
|
|
1455
1455
|
readonly endedAt?: string;
|
|
1456
1456
|
}
|
|
1457
1457
|
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1458
|
+
/**
|
|
1459
|
+
* Strict normalized schema for CLI target configuration.
|
|
1460
|
+
* This is the final validated shape after environment variable resolution
|
|
1461
|
+
* and internal field normalization.
|
|
1462
|
+
*
|
|
1463
|
+
* Uses .strict() to reject unknown properties, ensuring configuration
|
|
1464
|
+
* errors are caught early rather than silently ignored.
|
|
1465
|
+
*
|
|
1466
|
+
* @example
|
|
1467
|
+
* ```typescript
|
|
1468
|
+
* const config: CliNormalizedConfig = {
|
|
1469
|
+
* command: 'agent run {PROMPT}',
|
|
1470
|
+
* timeoutMs: 120000,
|
|
1471
|
+
* verbose: true,
|
|
1472
|
+
* };
|
|
1473
|
+
* CliTargetConfigSchema.parse(config); // Validates the normalized config
|
|
1474
|
+
* ```
|
|
1475
|
+
*/
|
|
1476
|
+
declare const CliTargetConfigSchema: z.ZodObject<{
|
|
1477
|
+
command: z.ZodString;
|
|
1478
|
+
filesFormat: z.ZodOptional<z.ZodString>;
|
|
1479
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
1480
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1481
|
+
healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
|
|
1482
|
+
url: z.ZodString;
|
|
1483
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1484
|
+
}, "strict", z.ZodTypeAny, {
|
|
1485
|
+
url: string;
|
|
1486
|
+
timeoutMs?: number | undefined;
|
|
1469
1487
|
}, {
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
}
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1488
|
+
url: string;
|
|
1489
|
+
timeoutMs?: number | undefined;
|
|
1490
|
+
}>, z.ZodObject<{
|
|
1491
|
+
command: z.ZodString;
|
|
1492
|
+
cwd: z.ZodOptional<z.ZodString>;
|
|
1493
|
+
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1494
|
+
}, "strict", z.ZodTypeAny, {
|
|
1495
|
+
command: string;
|
|
1496
|
+
timeoutMs?: number | undefined;
|
|
1497
|
+
cwd?: string | undefined;
|
|
1498
|
+
}, {
|
|
1499
|
+
command: string;
|
|
1500
|
+
timeoutMs?: number | undefined;
|
|
1501
|
+
cwd?: string | undefined;
|
|
1502
|
+
}>]>>;
|
|
1503
|
+
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
1504
|
+
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
1505
|
+
}, "strict", z.ZodTypeAny, {
|
|
1506
|
+
command: string;
|
|
1507
|
+
timeoutMs?: number | undefined;
|
|
1508
|
+
cwd?: string | undefined;
|
|
1509
|
+
verbose?: boolean | undefined;
|
|
1510
|
+
healthcheck?: {
|
|
1511
|
+
url: string;
|
|
1512
|
+
timeoutMs?: number | undefined;
|
|
1513
|
+
} | {
|
|
1514
|
+
command: string;
|
|
1515
|
+
timeoutMs?: number | undefined;
|
|
1516
|
+
cwd?: string | undefined;
|
|
1481
1517
|
} | undefined;
|
|
1518
|
+
filesFormat?: string | undefined;
|
|
1519
|
+
keepTempFiles?: boolean | undefined;
|
|
1482
1520
|
}, {
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1521
|
+
command: string;
|
|
1522
|
+
timeoutMs?: number | undefined;
|
|
1523
|
+
cwd?: string | undefined;
|
|
1524
|
+
verbose?: boolean | undefined;
|
|
1525
|
+
healthcheck?: {
|
|
1526
|
+
url: string;
|
|
1527
|
+
timeoutMs?: number | undefined;
|
|
1528
|
+
} | {
|
|
1529
|
+
command: string;
|
|
1530
|
+
timeoutMs?: number | undefined;
|
|
1531
|
+
cwd?: string | undefined;
|
|
1491
1532
|
} | undefined;
|
|
1533
|
+
filesFormat?: string | undefined;
|
|
1534
|
+
keepTempFiles?: boolean | undefined;
|
|
1492
1535
|
}>;
|
|
1493
|
-
type
|
|
1494
|
-
|
|
1495
|
-
declare const DEFAULT_EVAL_PATTERNS: readonly string[];
|
|
1496
|
-
type ExecutionDefaults = {
|
|
1497
|
-
readonly verbose?: boolean;
|
|
1498
|
-
readonly keep_workspaces?: boolean;
|
|
1499
|
-
readonly otel_file?: string;
|
|
1500
|
-
readonly export_otel?: boolean;
|
|
1501
|
-
readonly otel_backend?: string;
|
|
1502
|
-
readonly otel_capture_content?: boolean;
|
|
1503
|
-
readonly otel_group_turns?: boolean;
|
|
1504
|
-
readonly pool_workspaces?: boolean;
|
|
1505
|
-
readonly pool_slots?: number;
|
|
1506
|
-
};
|
|
1507
|
-
type ResultsExportConfig = {
|
|
1508
|
-
readonly repo: string;
|
|
1509
|
-
readonly path: string;
|
|
1510
|
-
readonly auto_push?: boolean;
|
|
1511
|
-
readonly branch_prefix?: string;
|
|
1512
|
-
};
|
|
1513
|
-
type AgentVConfig$1 = {
|
|
1514
|
-
readonly required_version?: string;
|
|
1515
|
-
readonly eval_patterns?: readonly string[];
|
|
1516
|
-
readonly execution?: ExecutionDefaults;
|
|
1517
|
-
readonly results?: {
|
|
1518
|
-
readonly export?: ResultsExportConfig;
|
|
1519
|
-
};
|
|
1520
|
-
};
|
|
1521
|
-
/**
|
|
1522
|
-
* Load optional .agentv/config.yaml configuration file.
|
|
1523
|
-
* Searches from eval file directory up to repo root.
|
|
1524
|
-
*/
|
|
1525
|
-
declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
|
|
1526
|
-
/**
|
|
1527
|
-
* Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
|
|
1528
|
-
*/
|
|
1529
|
-
declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
|
|
1530
|
-
/**
|
|
1531
|
-
* Extract target refs from parsed eval suite.
|
|
1532
|
-
* Supports both string shorthand and object form with hooks.
|
|
1533
|
-
* Returns undefined when no targets array is specified.
|
|
1534
|
-
*/
|
|
1535
|
-
declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
|
|
1536
|
-
/**
|
|
1537
|
-
* Extract target names from parsed eval suite (backward-compat wrapper).
|
|
1538
|
-
* Precedence: execution.targets (array) > execution.target (singular).
|
|
1539
|
-
* Returns undefined when no targets array is specified.
|
|
1540
|
-
*/
|
|
1541
|
-
declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
|
|
1542
|
-
/**
|
|
1543
|
-
* Extract workers count from suite-level execution block.
|
|
1544
|
-
*/
|
|
1545
|
-
declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
|
|
1546
|
-
/**
|
|
1547
|
-
* Extract per-test targets array from a raw test case object.
|
|
1548
|
-
*/
|
|
1549
|
-
declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
|
|
1550
|
-
/**
|
|
1551
|
-
* Extract trials configuration from parsed eval suite's execution block.
|
|
1552
|
-
* Returns undefined when count is 1 or not specified (no-op).
|
|
1553
|
-
*/
|
|
1554
|
-
declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
|
|
1536
|
+
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
1555
1537
|
/**
|
|
1556
|
-
*
|
|
1538
|
+
* Resolved CLI configuration type derived from CliTargetConfigSchema.
|
|
1539
|
+
* This is the final validated shape used by the CLI provider at runtime.
|
|
1540
|
+
* Using Readonly to ensure immutability for runtime safety.
|
|
1557
1541
|
*/
|
|
1558
|
-
|
|
1559
|
-
|
|
1560
|
-
readonly
|
|
1542
|
+
type CliResolvedConfig = Readonly<CliNormalizedConfig>;
|
|
1543
|
+
interface RetryConfig {
|
|
1544
|
+
readonly maxRetries?: number;
|
|
1545
|
+
readonly initialDelayMs?: number;
|
|
1546
|
+
readonly maxDelayMs?: number;
|
|
1547
|
+
readonly backoffFactor?: number;
|
|
1548
|
+
readonly retryableStatusCodes?: readonly number[];
|
|
1561
1549
|
}
|
|
1562
1550
|
/**
|
|
1563
|
-
*
|
|
1564
|
-
*
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1569
|
-
* Accepts `true` or `false`.
|
|
1570
|
-
* Returns undefined when not specified.
|
|
1551
|
+
* Selects which OpenAI-compatible API endpoint to use.
|
|
1552
|
+
* - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
|
|
1553
|
+
* - "responses": POST /responses — only supported by api.openai.com.
|
|
1554
|
+
*
|
|
1555
|
+
* Maps to Vercel AI SDK methods: "chat" → provider.chat(model), "responses" → provider(model).
|
|
1571
1556
|
*/
|
|
1572
|
-
|
|
1557
|
+
type ApiFormat = 'chat' | 'responses';
|
|
1573
1558
|
/**
|
|
1574
|
-
*
|
|
1575
|
-
* Accepts a number in [0, 1] range.
|
|
1576
|
-
* Returns undefined when not specified.
|
|
1559
|
+
* Azure OpenAI settings used by the Vercel AI SDK.
|
|
1577
1560
|
*/
|
|
1578
|
-
|
|
1579
|
-
|
|
1561
|
+
interface AzureResolvedConfig {
|
|
1562
|
+
readonly resourceName: string;
|
|
1563
|
+
readonly deploymentName: string;
|
|
1564
|
+
readonly apiKey: string;
|
|
1565
|
+
readonly version?: string;
|
|
1566
|
+
readonly apiFormat?: ApiFormat;
|
|
1567
|
+
readonly temperature?: number;
|
|
1568
|
+
readonly maxOutputTokens?: number;
|
|
1569
|
+
readonly retry?: RetryConfig;
|
|
1570
|
+
}
|
|
1580
1571
|
/**
|
|
1581
|
-
*
|
|
1582
|
-
* - 'agent': File references only (for providers with filesystem access)
|
|
1583
|
-
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
1572
|
+
* OpenAI-compatible settings used by the Vercel AI SDK.
|
|
1584
1573
|
*/
|
|
1585
|
-
|
|
1586
|
-
|
|
1574
|
+
interface OpenAIResolvedConfig {
|
|
1575
|
+
readonly baseURL: string;
|
|
1576
|
+
readonly apiKey: string;
|
|
1577
|
+
readonly model: string;
|
|
1578
|
+
readonly apiFormat?: ApiFormat;
|
|
1579
|
+
readonly temperature?: number;
|
|
1580
|
+
readonly maxOutputTokens?: number;
|
|
1581
|
+
readonly retry?: RetryConfig;
|
|
1582
|
+
}
|
|
1587
1583
|
/**
|
|
1588
|
-
*
|
|
1584
|
+
* OpenRouter settings used by the Vercel AI SDK provider.
|
|
1589
1585
|
*/
|
|
1590
|
-
interface
|
|
1591
|
-
readonly
|
|
1592
|
-
readonly
|
|
1593
|
-
readonly
|
|
1586
|
+
interface OpenRouterResolvedConfig {
|
|
1587
|
+
readonly apiKey: string;
|
|
1588
|
+
readonly model: string;
|
|
1589
|
+
readonly temperature?: number;
|
|
1590
|
+
readonly maxOutputTokens?: number;
|
|
1591
|
+
readonly retry?: RetryConfig;
|
|
1594
1592
|
}
|
|
1595
1593
|
/**
|
|
1596
|
-
*
|
|
1597
|
-
*
|
|
1598
|
-
* @param testCase - The evaluation test case
|
|
1599
|
-
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
1594
|
+
* Anthropic Claude settings used by the Vercel AI SDK.
|
|
1600
1595
|
*/
|
|
1601
|
-
|
|
1602
|
-
|
|
1596
|
+
interface AnthropicResolvedConfig {
|
|
1597
|
+
readonly apiKey: string;
|
|
1598
|
+
readonly model: string;
|
|
1599
|
+
readonly temperature?: number;
|
|
1600
|
+
readonly maxOutputTokens?: number;
|
|
1601
|
+
readonly thinkingBudget?: number;
|
|
1602
|
+
readonly retry?: RetryConfig;
|
|
1603
|
+
}
|
|
1603
1604
|
/**
|
|
1604
|
-
*
|
|
1605
|
+
* Google Gemini settings used by the Vercel AI SDK.
|
|
1605
1606
|
*/
|
|
1606
|
-
|
|
1607
|
-
|
|
1607
|
+
interface GeminiResolvedConfig {
|
|
1608
|
+
readonly apiKey: string;
|
|
1609
|
+
readonly model: string;
|
|
1610
|
+
readonly temperature?: number;
|
|
1611
|
+
readonly maxOutputTokens?: number;
|
|
1612
|
+
readonly retry?: RetryConfig;
|
|
1613
|
+
}
|
|
1614
|
+
interface CodexResolvedConfig {
|
|
1615
|
+
readonly model?: string;
|
|
1616
|
+
readonly executable: string;
|
|
1617
|
+
readonly args?: readonly string[];
|
|
1618
|
+
readonly cwd?: string;
|
|
1619
|
+
readonly timeoutMs?: number;
|
|
1620
|
+
readonly logDir?: string;
|
|
1621
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1622
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1623
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1624
|
+
readonly systemPrompt?: string;
|
|
1625
|
+
}
|
|
1626
|
+
interface CopilotCliResolvedConfig {
|
|
1627
|
+
readonly executable: string;
|
|
1628
|
+
readonly model?: string;
|
|
1629
|
+
readonly args?: readonly string[];
|
|
1630
|
+
readonly cwd?: string;
|
|
1631
|
+
readonly timeoutMs?: number;
|
|
1632
|
+
readonly logDir?: string;
|
|
1633
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1634
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1635
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1636
|
+
readonly systemPrompt?: string;
|
|
1637
|
+
}
|
|
1638
|
+
interface CopilotSdkResolvedConfig {
|
|
1639
|
+
readonly cliUrl?: string;
|
|
1640
|
+
readonly cliPath?: string;
|
|
1641
|
+
readonly githubToken?: string;
|
|
1642
|
+
readonly model?: string;
|
|
1643
|
+
readonly cwd?: string;
|
|
1644
|
+
readonly timeoutMs?: number;
|
|
1645
|
+
readonly logDir?: string;
|
|
1646
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1647
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1648
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1649
|
+
readonly systemPrompt?: string;
|
|
1650
|
+
/** BYOK provider type: "azure", "openai", or "anthropic". */
|
|
1651
|
+
readonly byokType?: string;
|
|
1652
|
+
/** BYOK base URL for the provider endpoint. */
|
|
1653
|
+
readonly byokBaseUrl?: string;
|
|
1654
|
+
/** BYOK API key for authenticating with the provider. */
|
|
1655
|
+
readonly byokApiKey?: string;
|
|
1656
|
+
/** BYOK bearer token (takes precedence over apiKey when set). */
|
|
1657
|
+
readonly byokBearerToken?: string;
|
|
1658
|
+
/** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
|
|
1659
|
+
readonly byokApiVersion?: string;
|
|
1660
|
+
/** BYOK wire API format: "completions" or "responses". */
|
|
1661
|
+
readonly byokWireApi?: string;
|
|
1662
|
+
}
|
|
1663
|
+
interface CopilotLogResolvedConfig {
|
|
1664
|
+
/** Explicit path to a session directory containing events.jsonl. */
|
|
1665
|
+
readonly sessionDir?: string;
|
|
1666
|
+
/** Session UUID — combined with sessionStateDir to build the path. */
|
|
1667
|
+
readonly sessionId?: string;
|
|
1668
|
+
/** Auto-discovery mode. 'latest' picks the most recent session. */
|
|
1669
|
+
readonly discover?: 'latest';
|
|
1670
|
+
/** Override the default ~/.copilot/session-state directory. */
|
|
1671
|
+
readonly sessionStateDir?: string;
|
|
1672
|
+
/** Filter discovery by working directory. */
|
|
1673
|
+
readonly cwd?: string;
|
|
1674
|
+
}
|
|
1675
|
+
interface PiCodingAgentResolvedConfig {
|
|
1676
|
+
readonly subprovider?: string;
|
|
1677
|
+
readonly model?: string;
|
|
1678
|
+
readonly apiKey?: string;
|
|
1679
|
+
readonly baseUrl?: string;
|
|
1680
|
+
readonly tools?: string;
|
|
1681
|
+
readonly thinking?: string;
|
|
1682
|
+
readonly cwd?: string;
|
|
1683
|
+
readonly timeoutMs?: number;
|
|
1684
|
+
readonly logDir?: string;
|
|
1685
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1686
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1687
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1688
|
+
readonly systemPrompt?: string;
|
|
1689
|
+
}
|
|
1690
|
+
interface PiCliResolvedConfig {
|
|
1691
|
+
readonly executable: string;
|
|
1692
|
+
readonly subprovider?: string;
|
|
1693
|
+
readonly model?: string;
|
|
1694
|
+
readonly apiKey?: string;
|
|
1695
|
+
readonly baseUrl?: string;
|
|
1696
|
+
readonly tools?: string;
|
|
1697
|
+
readonly thinking?: string;
|
|
1698
|
+
readonly args?: readonly string[];
|
|
1699
|
+
readonly cwd?: string;
|
|
1700
|
+
readonly timeoutMs?: number;
|
|
1701
|
+
readonly logDir?: string;
|
|
1702
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1703
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1704
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1705
|
+
readonly systemPrompt?: string;
|
|
1706
|
+
}
|
|
1707
|
+
interface ClaudeResolvedConfig {
|
|
1708
|
+
readonly executable: string;
|
|
1709
|
+
readonly model?: string;
|
|
1710
|
+
readonly systemPrompt?: string;
|
|
1711
|
+
readonly cwd?: string;
|
|
1712
|
+
readonly timeoutMs?: number;
|
|
1713
|
+
readonly maxTurns?: number;
|
|
1714
|
+
readonly maxBudgetUsd?: number;
|
|
1715
|
+
readonly logDir?: string;
|
|
1716
|
+
readonly logFormat?: 'summary' | 'json';
|
|
1717
|
+
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1718
|
+
readonly streamLog?: false | 'raw' | 'summary';
|
|
1719
|
+
}
|
|
1720
|
+
interface MockResolvedConfig {
|
|
1721
|
+
readonly response?: string;
|
|
1722
|
+
readonly delayMs?: number;
|
|
1723
|
+
readonly delayMinMs?: number;
|
|
1724
|
+
readonly delayMaxMs?: number;
|
|
1725
|
+
}
|
|
1726
|
+
interface VSCodeResolvedConfig {
|
|
1727
|
+
readonly executable: string;
|
|
1728
|
+
readonly waitForResponse: boolean;
|
|
1729
|
+
readonly dryRun: boolean;
|
|
1730
|
+
readonly subagentRoot?: string;
|
|
1731
|
+
readonly timeoutMs?: number;
|
|
1732
|
+
}
|
|
1733
|
+
interface AgentVResolvedConfig {
|
|
1734
|
+
readonly model: string;
|
|
1735
|
+
readonly temperature: number;
|
|
1736
|
+
}
|
|
1737
|
+
/** Base fields shared by all resolved targets. */
|
|
1738
|
+
interface ResolvedTargetBase {
|
|
1739
|
+
readonly name: string;
|
|
1740
|
+
readonly graderTarget?: string;
|
|
1741
|
+
readonly workers?: number;
|
|
1742
|
+
readonly providerBatching?: boolean;
|
|
1743
|
+
/**
|
|
1744
|
+
* Whether this target can be executed via executor subagents in subagent mode.
|
|
1745
|
+
* Defaults to `true` for all non-CLI providers. Set `false` in targets.yaml
|
|
1746
|
+
* to force CLI invocation even in subagent mode.
|
|
1747
|
+
*/
|
|
1748
|
+
readonly subagentModeAllowed?: boolean;
|
|
1749
|
+
/**
|
|
1750
|
+
* Ordered list of target names to try when the primary target fails after
|
|
1751
|
+
* exhausting retries. Each fallback is attempted in order.
|
|
1752
|
+
*/
|
|
1753
|
+
readonly fallbackTargets?: readonly string[];
|
|
1754
|
+
}
|
|
1755
|
+
type ResolvedTarget = (ResolvedTargetBase & {
|
|
1756
|
+
readonly kind: 'openai';
|
|
1757
|
+
readonly config: OpenAIResolvedConfig;
|
|
1758
|
+
}) | (ResolvedTargetBase & {
|
|
1759
|
+
readonly kind: 'openrouter';
|
|
1760
|
+
readonly config: OpenRouterResolvedConfig;
|
|
1761
|
+
}) | (ResolvedTargetBase & {
|
|
1762
|
+
readonly kind: 'azure';
|
|
1763
|
+
readonly config: AzureResolvedConfig;
|
|
1764
|
+
}) | (ResolvedTargetBase & {
|
|
1765
|
+
readonly kind: 'anthropic';
|
|
1766
|
+
readonly config: AnthropicResolvedConfig;
|
|
1767
|
+
}) | (ResolvedTargetBase & {
|
|
1768
|
+
readonly kind: 'gemini';
|
|
1769
|
+
readonly config: GeminiResolvedConfig;
|
|
1770
|
+
}) | (ResolvedTargetBase & {
|
|
1771
|
+
readonly kind: 'codex';
|
|
1772
|
+
readonly config: CodexResolvedConfig;
|
|
1773
|
+
}) | (ResolvedTargetBase & {
|
|
1774
|
+
readonly kind: 'copilot-sdk';
|
|
1775
|
+
readonly config: CopilotSdkResolvedConfig;
|
|
1776
|
+
}) | (ResolvedTargetBase & {
|
|
1777
|
+
readonly kind: 'copilot-cli';
|
|
1778
|
+
readonly config: CopilotCliResolvedConfig;
|
|
1779
|
+
}) | (ResolvedTargetBase & {
|
|
1780
|
+
readonly kind: 'copilot-log';
|
|
1781
|
+
readonly config: CopilotLogResolvedConfig;
|
|
1782
|
+
}) | (ResolvedTargetBase & {
|
|
1783
|
+
readonly kind: 'pi-coding-agent';
|
|
1784
|
+
readonly config: PiCodingAgentResolvedConfig;
|
|
1785
|
+
}) | (ResolvedTargetBase & {
|
|
1786
|
+
readonly kind: 'pi-cli';
|
|
1787
|
+
readonly config: PiCliResolvedConfig;
|
|
1788
|
+
}) | (ResolvedTargetBase & {
|
|
1789
|
+
readonly kind: 'claude';
|
|
1790
|
+
readonly config: ClaudeResolvedConfig;
|
|
1791
|
+
}) | (ResolvedTargetBase & {
|
|
1792
|
+
readonly kind: 'claude-cli';
|
|
1793
|
+
readonly config: ClaudeResolvedConfig;
|
|
1794
|
+
}) | (ResolvedTargetBase & {
|
|
1795
|
+
readonly kind: 'claude-sdk';
|
|
1796
|
+
readonly config: ClaudeResolvedConfig;
|
|
1797
|
+
}) | (ResolvedTargetBase & {
|
|
1798
|
+
readonly kind: 'mock';
|
|
1799
|
+
readonly config: MockResolvedConfig;
|
|
1800
|
+
}) | (ResolvedTargetBase & {
|
|
1801
|
+
readonly kind: 'vscode' | 'vscode-insiders';
|
|
1802
|
+
readonly config: VSCodeResolvedConfig;
|
|
1803
|
+
}) | (ResolvedTargetBase & {
|
|
1804
|
+
readonly kind: 'agentv';
|
|
1805
|
+
readonly config: AgentVResolvedConfig;
|
|
1806
|
+
}) | (ResolvedTargetBase & {
|
|
1807
|
+
readonly kind: 'cli';
|
|
1808
|
+
readonly config: CliResolvedConfig;
|
|
1809
|
+
}) | (ResolvedTargetBase & {
|
|
1810
|
+
readonly kind: 'transcript';
|
|
1811
|
+
readonly config: Record<string, never>;
|
|
1812
|
+
});
|
|
1813
|
+
/**
|
|
1814
|
+
* Optional settings accepted on ALL target definitions regardless of provider.
|
|
1815
|
+
* Exported so the targets validator can reuse the same list — adding a field
|
|
1816
|
+
* here automatically makes it valid in targets.yaml without a separate update.
|
|
1817
|
+
*/
|
|
1818
|
+
declare const COMMON_TARGET_SETTINGS: readonly ["use_target", "provider_batching", "subagent_mode_allowed", "fallback_targets"];
|
|
1819
|
+
declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
|
|
1820
|
+
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
|
|
1821
|
+
readonly emitDeprecationWarnings?: boolean;
|
|
1822
|
+
}): ResolvedTarget;
|
|
1823
|
+
|
|
1824
|
+
/**
|
|
1825
|
+
* Extensible provider registry.
|
|
1826
|
+
*
|
|
1827
|
+
* Replaces the hardcoded switch/case dispatch in createProvider() with
|
|
1828
|
+
* a registry of named factory functions. Built-in providers are registered
|
|
1829
|
+
* at startup; users can add custom providers via the registry API or by
|
|
1830
|
+
* dropping files in `.agentv/providers/`.
|
|
1831
|
+
*/
|
|
1832
|
+
|
|
1833
|
+
/**
|
|
1834
|
+
* Factory function that creates a Provider instance from a resolved target.
|
|
1835
|
+
*/
|
|
1836
|
+
type ProviderFactoryFn = (target: ResolvedTarget) => Provider;
|
|
1837
|
+
/**
|
|
1838
|
+
* Registry of provider factory functions keyed by provider kind.
|
|
1839
|
+
*
|
|
1840
|
+
* Built-in providers are registered at startup. Custom providers can be
|
|
1841
|
+
* registered via the `register()` method.
|
|
1842
|
+
*/
|
|
1843
|
+
declare class ProviderRegistry {
|
|
1844
|
+
private readonly factories;
|
|
1845
|
+
/** Register a factory function for a provider kind. */
|
|
1846
|
+
register(kind: string, factory: ProviderFactoryFn): this;
|
|
1847
|
+
/** Get the factory function for a provider kind. */
|
|
1848
|
+
get(kind: string): ProviderFactoryFn | undefined;
|
|
1849
|
+
/** Check if a factory is registered for the given kind. */
|
|
1850
|
+
has(kind: string): boolean;
|
|
1851
|
+
/** List all registered provider kind names. */
|
|
1852
|
+
list(): string[];
|
|
1853
|
+
/**
|
|
1854
|
+
* Create a provider instance from a resolved target.
|
|
1855
|
+
* Falls back to CLI provider for unknown kinds (custom provider escape hatch).
|
|
1856
|
+
*/
|
|
1857
|
+
create(target: ResolvedTarget): Provider;
|
|
1858
|
+
}
|
|
1859
|
+
|
|
1860
|
+
declare const MetadataSchema: z.ZodObject<{
|
|
1861
|
+
name: z.ZodString;
|
|
1862
|
+
description: z.ZodOptional<z.ZodString>;
|
|
1863
|
+
version: z.ZodOptional<z.ZodString>;
|
|
1864
|
+
author: z.ZodOptional<z.ZodString>;
|
|
1865
|
+
tags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
1866
|
+
license: z.ZodOptional<z.ZodString>;
|
|
1867
|
+
requires: z.ZodOptional<z.ZodObject<{
|
|
1868
|
+
agentv: z.ZodOptional<z.ZodString>;
|
|
1869
|
+
}, "strip", z.ZodTypeAny, {
|
|
1870
|
+
agentv?: string | undefined;
|
|
1871
|
+
}, {
|
|
1872
|
+
agentv?: string | undefined;
|
|
1873
|
+
}>>;
|
|
1874
|
+
}, "strip", z.ZodTypeAny, {
|
|
1875
|
+
name: string;
|
|
1876
|
+
description?: string | undefined;
|
|
1877
|
+
version?: string | undefined;
|
|
1878
|
+
author?: string | undefined;
|
|
1879
|
+
tags?: string[] | undefined;
|
|
1880
|
+
license?: string | undefined;
|
|
1881
|
+
requires?: {
|
|
1882
|
+
agentv?: string | undefined;
|
|
1883
|
+
} | undefined;
|
|
1884
|
+
}, {
|
|
1885
|
+
name: string;
|
|
1886
|
+
description?: string | undefined;
|
|
1887
|
+
version?: string | undefined;
|
|
1888
|
+
author?: string | undefined;
|
|
1889
|
+
tags?: string[] | undefined;
|
|
1890
|
+
license?: string | undefined;
|
|
1891
|
+
requires?: {
|
|
1892
|
+
agentv?: string | undefined;
|
|
1893
|
+
} | undefined;
|
|
1894
|
+
}>;
|
|
1895
|
+
type EvalMetadata = z.infer<typeof MetadataSchema>;
|
|
1896
|
+
|
|
1897
|
+
declare const DEFAULT_EVAL_PATTERNS: readonly string[];
|
|
1898
|
+
type ExecutionDefaults = {
|
|
1899
|
+
readonly verbose?: boolean;
|
|
1900
|
+
readonly keep_workspaces?: boolean;
|
|
1901
|
+
readonly otel_file?: string;
|
|
1902
|
+
readonly export_otel?: boolean;
|
|
1903
|
+
readonly otel_backend?: string;
|
|
1904
|
+
readonly otel_capture_content?: boolean;
|
|
1905
|
+
readonly otel_group_turns?: boolean;
|
|
1906
|
+
readonly pool_workspaces?: boolean;
|
|
1907
|
+
readonly pool_slots?: number;
|
|
1908
|
+
};
|
|
1909
|
+
type ResultsExportConfig = {
|
|
1910
|
+
readonly repo: string;
|
|
1911
|
+
readonly path: string;
|
|
1912
|
+
readonly auto_push?: boolean;
|
|
1913
|
+
readonly branch_prefix?: string;
|
|
1914
|
+
};
|
|
1915
|
+
type AgentVConfig$1 = {
|
|
1916
|
+
readonly required_version?: string;
|
|
1917
|
+
readonly eval_patterns?: readonly string[];
|
|
1918
|
+
readonly execution?: ExecutionDefaults;
|
|
1919
|
+
readonly results?: {
|
|
1920
|
+
readonly export?: ResultsExportConfig;
|
|
1921
|
+
};
|
|
1922
|
+
};
|
|
1923
|
+
/**
|
|
1924
|
+
* Load optional .agentv/config.yaml configuration file.
|
|
1925
|
+
* Searches from eval file directory up to repo root.
|
|
1926
|
+
*/
|
|
1927
|
+
declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<AgentVConfig$1 | null>;
|
|
1928
|
+
/**
|
|
1929
|
+
* Extract target name from parsed eval suite (checks execution.target then falls back to root-level target).
|
|
1930
|
+
*/
|
|
1931
|
+
declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
|
|
1932
|
+
/**
|
|
1933
|
+
* Extract target refs from parsed eval suite.
|
|
1934
|
+
* Supports both string shorthand and object form with hooks.
|
|
1935
|
+
* Returns undefined when no targets array is specified.
|
|
1936
|
+
*/
|
|
1937
|
+
declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
|
|
1938
|
+
/**
|
|
1939
|
+
* Extract target names from parsed eval suite (backward-compat wrapper).
|
|
1940
|
+
* Precedence: execution.targets (array) > execution.target (singular).
|
|
1941
|
+
* Returns undefined when no targets array is specified.
|
|
1942
|
+
*/
|
|
1943
|
+
declare function extractTargetsFromSuite(suite: JsonObject): readonly string[] | undefined;
|
|
1944
|
+
/**
|
|
1945
|
+
* Extract workers count from suite-level execution block.
|
|
1946
|
+
*/
|
|
1947
|
+
declare function extractWorkersFromSuite(suite: JsonObject): number | undefined;
|
|
1948
|
+
/**
|
|
1949
|
+
* Extract per-test targets array from a raw test case object.
|
|
1950
|
+
*/
|
|
1951
|
+
declare function extractTargetsFromTestCase(testCase: JsonObject): readonly string[] | undefined;
|
|
1952
|
+
/**
|
|
1953
|
+
* Extract trials configuration from parsed eval suite's execution block.
|
|
1954
|
+
* Returns undefined when count is 1 or not specified (no-op).
|
|
1955
|
+
*/
|
|
1956
|
+
declare function extractTrialsConfig(suite: JsonObject): TrialsConfig | undefined;
|
|
1957
|
+
/**
|
|
1958
|
+
* Cache configuration parsed from execution block.
|
|
1959
|
+
*/
|
|
1960
|
+
interface CacheConfig {
|
|
1961
|
+
readonly enabled: boolean;
|
|
1962
|
+
readonly cachePath?: string;
|
|
1963
|
+
}
|
|
1964
|
+
/**
|
|
1965
|
+
* Extract cache configuration from parsed eval suite's execution block.
|
|
1966
|
+
* Returns undefined when no cache config is specified.
|
|
1967
|
+
*/
|
|
1968
|
+
declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
1969
|
+
/**
|
|
1970
|
+
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1971
|
+
* Accepts `true` or `false`.
|
|
1972
|
+
* Returns undefined when not specified.
|
|
1973
|
+
*/
|
|
1974
|
+
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1975
|
+
/**
|
|
1976
|
+
* Extract `execution.threshold` from parsed eval suite.
|
|
1977
|
+
* Accepts a number in [0, 1] range.
|
|
1978
|
+
* Returns undefined when not specified.
|
|
1979
|
+
*/
|
|
1980
|
+
declare function extractThreshold(suite: JsonObject): number | undefined;
|
|
1981
|
+
|
|
1982
|
+
/**
|
|
1983
|
+
* Formatting mode for segment content.
|
|
1984
|
+
* - 'agent': File references only (for providers with filesystem access)
|
|
1985
|
+
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
1986
|
+
*/
|
|
1987
|
+
type FormattingMode = 'agent' | 'lm';
|
|
1988
|
+
|
|
1989
|
+
/**
|
|
1990
|
+
* Build prompt inputs by consolidating user request context.
|
|
1991
|
+
*/
|
|
1992
|
+
interface PromptInputs {
|
|
1993
|
+
readonly question: string;
|
|
1994
|
+
readonly chatPrompt?: ChatPrompt;
|
|
1995
|
+
readonly systemMessage?: string;
|
|
1996
|
+
}
|
|
1997
|
+
/**
|
|
1998
|
+
* Build prompt inputs by consolidating user request context.
|
|
1999
|
+
*
|
|
2000
|
+
* @param testCase - The evaluation test case
|
|
2001
|
+
* @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
|
|
2002
|
+
*/
|
|
2003
|
+
declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): Promise<PromptInputs>;
|
|
2004
|
+
|
|
2005
|
+
/**
|
|
2006
|
+
* Detect file format by extension.
|
|
2007
|
+
*/
|
|
2008
|
+
declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json' | 'typescript';
|
|
2009
|
+
|
|
1608
2010
|
type LoadOptions = {
|
|
1609
2011
|
readonly verbose?: boolean;
|
|
1610
2012
|
/** Filter tests by ID pattern(s) (glob supported, e.g., "summary-*"). Arrays use OR logic. */
|
|
@@ -1647,6 +2049,10 @@ type EvalSuiteResult = {
|
|
|
1647
2049
|
readonly threshold?: number;
|
|
1648
2050
|
/** Resolved workspace.path from the eval YAML (after env-var expansion), if set */
|
|
1649
2051
|
readonly workspacePath?: string;
|
|
2052
|
+
/** Inline target definition from a TS eval config. */
|
|
2053
|
+
readonly inlineTarget?: TargetDefinition;
|
|
2054
|
+
/** Custom provider factory from a TS eval config task(). */
|
|
2055
|
+
readonly providerFactory?: ProviderFactoryFn;
|
|
1650
2056
|
};
|
|
1651
2057
|
/**
|
|
1652
2058
|
* Load tests and suite metadata from a single parse.
|
|
@@ -1693,495 +2099,370 @@ declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEval
|
|
|
1693
2099
|
declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
|
|
1694
2100
|
|
|
1695
2101
|
/**
|
|
1696
|
-
*
|
|
2102
|
+
* Types for inline assertion functions used in the evaluate() API.
|
|
1697
2103
|
*
|
|
1698
|
-
*
|
|
1699
|
-
*
|
|
2104
|
+
* Inline functions are the escape hatch for custom evaluation logic
|
|
2105
|
+
* that doesn't fit a built-in grader type. For built-in assertions
|
|
2106
|
+
* (contains, regex, is-json, etc.), use config objects instead:
|
|
1700
2107
|
*
|
|
1701
|
-
*
|
|
1702
|
-
*/
|
|
1703
|
-
interface EvalsJsonCase {
|
|
1704
|
-
id: number;
|
|
1705
|
-
prompt: string;
|
|
1706
|
-
expected_output?: string;
|
|
1707
|
-
files?: string[];
|
|
1708
|
-
should_trigger?: boolean;
|
|
1709
|
-
assertions: string[];
|
|
1710
|
-
}
|
|
1711
|
-
interface EvalsJsonFile {
|
|
1712
|
-
skill_name: string;
|
|
1713
|
-
evals: EvalsJsonCase[];
|
|
1714
|
-
}
|
|
1715
|
-
/**
|
|
1716
|
-
* Result of transpiling a single EVAL.yaml.
|
|
1717
|
-
* May produce multiple evals.json files (one per skill).
|
|
1718
|
-
*/
|
|
1719
|
-
interface TranspileResult {
|
|
1720
|
-
/** Map from skill_name → EvalsJsonFile */
|
|
1721
|
-
files: Map<string, EvalsJsonFile>;
|
|
1722
|
-
/** Warning messages accumulated during transpilation */
|
|
1723
|
-
warnings: string[];
|
|
1724
|
-
}
|
|
1725
|
-
/**
|
|
1726
|
-
* Transpile a parsed EVAL.yaml object into one or more evals.json objects.
|
|
2108
|
+
* assert: [{ type: 'contains', value: 'hello' }]
|
|
1727
2109
|
*
|
|
1728
|
-
*
|
|
1729
|
-
* @param source Source identifier for error messages (e.g. file path)
|
|
1730
|
-
*/
|
|
1731
|
-
declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
|
|
1732
|
-
/**
|
|
1733
|
-
* Transpile an EVAL.yaml file into one or more evals.json objects.
|
|
1734
|
-
* Returns a map from output filename → JSON content.
|
|
2110
|
+
* Inline functions are for custom logic:
|
|
1735
2111
|
*
|
|
1736
|
-
*
|
|
1737
|
-
*/
|
|
1738
|
-
declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
|
|
1739
|
-
/**
|
|
1740
|
-
* Determine the output filename(s) for a transpile result.
|
|
1741
|
-
* Single skill → "evals.json"
|
|
1742
|
-
* Multiple skills → "<skill>.evals.json"
|
|
1743
|
-
*/
|
|
1744
|
-
declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
|
|
1745
|
-
|
|
1746
|
-
declare function fileExists(filePath: string): Promise<boolean>;
|
|
1747
|
-
/**
|
|
1748
|
-
* Normalize line endings to LF (\n).
|
|
1749
|
-
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
1750
|
-
*/
|
|
1751
|
-
declare function normalizeLineEndings(content: string): string;
|
|
1752
|
-
/**
|
|
1753
|
-
* Read a text file and normalize line endings to LF (\n).
|
|
1754
|
-
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
1755
|
-
*/
|
|
1756
|
-
declare function readTextFile(filePath: string): Promise<string>;
|
|
1757
|
-
/**
|
|
1758
|
-
* Read a JSON file and parse it.
|
|
1759
|
-
*/
|
|
1760
|
-
declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
|
|
1761
|
-
/**
|
|
1762
|
-
* Find git repository root by walking up the directory tree.
|
|
1763
|
-
*/
|
|
1764
|
-
declare function findGitRoot(startPath: string): Promise<string | null>;
|
|
1765
|
-
/**
|
|
1766
|
-
* Build a chain of directories walking from a file's location up to repo root.
|
|
1767
|
-
* Used for discovering configuration files like targets.yaml or config.yaml.
|
|
1768
|
-
*/
|
|
1769
|
-
declare function buildDirectoryChain(filePath: string, repoRoot: string): readonly string[];
|
|
1770
|
-
/**
|
|
1771
|
-
* Build search roots for file resolution, matching yaml-parser behavior.
|
|
1772
|
-
* Searches from eval file directory up to repo root.
|
|
1773
|
-
*/
|
|
1774
|
-
declare function buildSearchRoots(evalPath: string, repoRoot: string): readonly string[];
|
|
1775
|
-
/**
|
|
1776
|
-
* Resolve a file reference using search roots, matching yaml-parser behavior.
|
|
2112
|
+
* assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
|
|
1777
2113
|
*/
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
readonly
|
|
1781
|
-
readonly
|
|
1782
|
-
|
|
2114
|
+
/** Context passed to inline assertion functions */
|
|
2115
|
+
interface AssertContext {
|
|
2116
|
+
readonly input: string;
|
|
2117
|
+
readonly output: string;
|
|
2118
|
+
readonly expectedOutput?: string;
|
|
2119
|
+
readonly criteria?: string;
|
|
2120
|
+
readonly metadata?: Record<string, unknown>;
|
|
2121
|
+
}
|
|
2122
|
+
/** Result from an inline assertion function */
|
|
2123
|
+
interface AssertResult {
|
|
2124
|
+
readonly name: string;
|
|
2125
|
+
readonly score: number;
|
|
2126
|
+
readonly metadata?: Record<string, unknown>;
|
|
2127
|
+
}
|
|
2128
|
+
/** Inline assertion function signature */
|
|
2129
|
+
type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
|
|
1783
2130
|
|
|
1784
2131
|
/**
|
|
1785
|
-
*
|
|
1786
|
-
* This is the final validated shape after environment variable resolution
|
|
1787
|
-
* and internal field normalization.
|
|
2132
|
+
* Programmatic API for running evaluations.
|
|
1788
2133
|
*
|
|
1789
|
-
*
|
|
1790
|
-
*
|
|
2134
|
+
* Provides `evaluate()` — a high-level function for using AgentV as a library
|
|
2135
|
+
* instead of a CLI. The config shape mirrors the YAML structure for easy
|
|
2136
|
+
* translation between file-based and programmatic usage.
|
|
1791
2137
|
*
|
|
1792
|
-
* @example
|
|
2138
|
+
* @example Inline tests with config objects
|
|
1793
2139
|
* ```typescript
|
|
1794
|
-
*
|
|
1795
|
-
*
|
|
1796
|
-
*
|
|
1797
|
-
*
|
|
1798
|
-
*
|
|
1799
|
-
*
|
|
2140
|
+
* import { evaluate } from '@agentv/core';
|
|
2141
|
+
*
|
|
2142
|
+
* const results = await evaluate({
|
|
2143
|
+
* tests: [
|
|
2144
|
+
* {
|
|
2145
|
+
* id: 'capital',
|
|
2146
|
+
* input: 'What is the capital of France?',
|
|
2147
|
+
* expectedOutput: 'Paris',
|
|
2148
|
+
* assert: [{ type: 'contains', value: 'Paris' }],
|
|
2149
|
+
* },
|
|
2150
|
+
* ],
|
|
2151
|
+
* target: { provider: 'mock_agent' },
|
|
2152
|
+
* });
|
|
2153
|
+
*
|
|
2154
|
+
* console.log(results.summary.passed, 'passed');
|
|
2155
|
+
* ```
|
|
2156
|
+
*
|
|
2157
|
+
* @example Inline tests with task function and custom assertion
|
|
2158
|
+
* ```typescript
|
|
2159
|
+
* import { evaluate } from '@agentv/core';
|
|
2160
|
+
*
|
|
2161
|
+
* const { summary } = await evaluate({
|
|
2162
|
+
* tests: [
|
|
2163
|
+
* {
|
|
2164
|
+
* id: 'echo',
|
|
2165
|
+
* input: 'hello',
|
|
2166
|
+
* expectedOutput: 'Echo: hello',
|
|
2167
|
+
* assert: [
|
|
2168
|
+
* { type: 'contains', value: 'hello' },
|
|
2169
|
+
* { type: 'equals' },
|
|
2170
|
+
* ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
|
|
2171
|
+
* ],
|
|
2172
|
+
* },
|
|
2173
|
+
* ],
|
|
2174
|
+
* task: async (input) => `Echo: ${input}`,
|
|
2175
|
+
* });
|
|
1800
2176
|
* ```
|
|
1801
|
-
*/
|
|
1802
|
-
declare const CliTargetConfigSchema: z.ZodObject<{
|
|
1803
|
-
command: z.ZodString;
|
|
1804
|
-
filesFormat: z.ZodOptional<z.ZodString>;
|
|
1805
|
-
cwd: z.ZodOptional<z.ZodString>;
|
|
1806
|
-
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1807
|
-
healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
|
|
1808
|
-
url: z.ZodString;
|
|
1809
|
-
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1810
|
-
}, "strict", z.ZodTypeAny, {
|
|
1811
|
-
url: string;
|
|
1812
|
-
timeoutMs?: number | undefined;
|
|
1813
|
-
}, {
|
|
1814
|
-
url: string;
|
|
1815
|
-
timeoutMs?: number | undefined;
|
|
1816
|
-
}>, z.ZodObject<{
|
|
1817
|
-
command: z.ZodString;
|
|
1818
|
-
cwd: z.ZodOptional<z.ZodString>;
|
|
1819
|
-
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1820
|
-
}, "strict", z.ZodTypeAny, {
|
|
1821
|
-
command: string;
|
|
1822
|
-
timeoutMs?: number | undefined;
|
|
1823
|
-
cwd?: string | undefined;
|
|
1824
|
-
}, {
|
|
1825
|
-
command: string;
|
|
1826
|
-
timeoutMs?: number | undefined;
|
|
1827
|
-
cwd?: string | undefined;
|
|
1828
|
-
}>]>>;
|
|
1829
|
-
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
1830
|
-
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
1831
|
-
}, "strict", z.ZodTypeAny, {
|
|
1832
|
-
command: string;
|
|
1833
|
-
timeoutMs?: number | undefined;
|
|
1834
|
-
cwd?: string | undefined;
|
|
1835
|
-
verbose?: boolean | undefined;
|
|
1836
|
-
healthcheck?: {
|
|
1837
|
-
url: string;
|
|
1838
|
-
timeoutMs?: number | undefined;
|
|
1839
|
-
} | {
|
|
1840
|
-
command: string;
|
|
1841
|
-
timeoutMs?: number | undefined;
|
|
1842
|
-
cwd?: string | undefined;
|
|
1843
|
-
} | undefined;
|
|
1844
|
-
filesFormat?: string | undefined;
|
|
1845
|
-
keepTempFiles?: boolean | undefined;
|
|
1846
|
-
}, {
|
|
1847
|
-
command: string;
|
|
1848
|
-
timeoutMs?: number | undefined;
|
|
1849
|
-
cwd?: string | undefined;
|
|
1850
|
-
verbose?: boolean | undefined;
|
|
1851
|
-
healthcheck?: {
|
|
1852
|
-
url: string;
|
|
1853
|
-
timeoutMs?: number | undefined;
|
|
1854
|
-
} | {
|
|
1855
|
-
command: string;
|
|
1856
|
-
timeoutMs?: number | undefined;
|
|
1857
|
-
cwd?: string | undefined;
|
|
1858
|
-
} | undefined;
|
|
1859
|
-
filesFormat?: string | undefined;
|
|
1860
|
-
keepTempFiles?: boolean | undefined;
|
|
1861
|
-
}>;
|
|
1862
|
-
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
1863
|
-
/**
|
|
1864
|
-
* Resolved CLI configuration type derived from CliTargetConfigSchema.
|
|
1865
|
-
* This is the final validated shape used by the CLI provider at runtime.
|
|
1866
|
-
* Using Readonly to ensure immutability for runtime safety.
|
|
1867
|
-
*/
|
|
1868
|
-
type CliResolvedConfig = Readonly<CliNormalizedConfig>;
|
|
1869
|
-
interface RetryConfig {
|
|
1870
|
-
readonly maxRetries?: number;
|
|
1871
|
-
readonly initialDelayMs?: number;
|
|
1872
|
-
readonly maxDelayMs?: number;
|
|
1873
|
-
readonly backoffFactor?: number;
|
|
1874
|
-
readonly retryableStatusCodes?: readonly number[];
|
|
1875
|
-
}
|
|
1876
|
-
/**
|
|
1877
|
-
* Selects which OpenAI-compatible API endpoint to use.
|
|
1878
|
-
* - "chat" (default): POST /chat/completions — universally supported by all OpenAI-compatible providers.
|
|
1879
|
-
* - "responses": POST /responses — only supported by api.openai.com.
|
|
1880
2177
|
*
|
|
1881
|
-
*
|
|
2178
|
+
* @example File-based
|
|
2179
|
+
* ```typescript
|
|
2180
|
+
* const results = await evaluate({
|
|
2181
|
+
* specFile: './evals/EVAL.yaml',
|
|
2182
|
+
* target: { provider: 'claude_agent' },
|
|
2183
|
+
* });
|
|
2184
|
+
* ```
|
|
2185
|
+
*
|
|
2186
|
+
* @module
|
|
1882
2187
|
*/
|
|
1883
|
-
|
|
2188
|
+
|
|
1884
2189
|
/**
|
|
1885
|
-
*
|
|
2190
|
+
* Inline test definition for the programmatic API.
|
|
2191
|
+
* Mirrors the YAML test structure.
|
|
1886
2192
|
*/
|
|
1887
|
-
interface
|
|
1888
|
-
|
|
1889
|
-
readonly
|
|
1890
|
-
|
|
1891
|
-
readonly
|
|
1892
|
-
|
|
1893
|
-
readonly
|
|
1894
|
-
|
|
1895
|
-
|
|
2193
|
+
interface EvalTestInput {
|
|
2194
|
+
/** Unique test identifier */
|
|
2195
|
+
readonly id: string;
|
|
2196
|
+
/** What the response should accomplish */
|
|
2197
|
+
readonly criteria?: string;
|
|
2198
|
+
/** Input to the agent (string or message array). Omit when using turns[]. */
|
|
2199
|
+
readonly input?: string | readonly {
|
|
2200
|
+
role: string;
|
|
2201
|
+
content: string;
|
|
2202
|
+
}[];
|
|
2203
|
+
/** Expected reference output (camelCase preferred) */
|
|
2204
|
+
readonly expectedOutput?: string;
|
|
2205
|
+
/** @deprecated Use `expectedOutput` instead */
|
|
2206
|
+
readonly expected_output?: string;
|
|
2207
|
+
/** Assertion graders — accepts factory functions, config objects, or inline functions */
|
|
2208
|
+
readonly assert?: readonly AssertEntry[];
|
|
2209
|
+
/** Arbitrary metadata */
|
|
2210
|
+
readonly metadata?: Record<string, unknown>;
|
|
2211
|
+
/** Enable multi-turn conversation mode. Inferred automatically when turns[] is provided. */
|
|
2212
|
+
readonly mode?: 'conversation';
|
|
2213
|
+
/** Ordered turns for conversation evaluation. Each turn generates a fresh LLM call. */
|
|
2214
|
+
readonly turns?: readonly ConversationTurnInput[];
|
|
2215
|
+
/** Score aggregation across turns: 'mean' (default), 'min', or 'max'. */
|
|
2216
|
+
readonly aggregation?: ConversationAggregation;
|
|
1896
2217
|
}
|
|
1897
2218
|
/**
|
|
1898
|
-
*
|
|
2219
|
+
* A single turn in a multi-turn conversation evaluation (programmatic API).
|
|
2220
|
+
* Mirrors the YAML `turns` structure with camelCase naming.
|
|
1899
2221
|
*/
|
|
1900
|
-
interface
|
|
1901
|
-
|
|
1902
|
-
readonly
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
readonly
|
|
2222
|
+
interface ConversationTurnInput {
|
|
2223
|
+
/** Input for this turn (string or message array) */
|
|
2224
|
+
readonly input: string | readonly {
|
|
2225
|
+
role: string;
|
|
2226
|
+
content: string;
|
|
2227
|
+
}[];
|
|
2228
|
+
/** Expected reference output for this turn */
|
|
2229
|
+
readonly expectedOutput?: string;
|
|
2230
|
+
/** @deprecated Use `expectedOutput` instead */
|
|
2231
|
+
readonly expected_output?: string;
|
|
2232
|
+
/** Per-turn assertions (string criteria or grader config) */
|
|
2233
|
+
readonly assert?: readonly AssertEntry[];
|
|
1908
2234
|
}
|
|
1909
2235
|
/**
|
|
1910
|
-
*
|
|
2236
|
+
* Inline assertion definition for the programmatic API.
|
|
2237
|
+
* Matches the YAML `assert` block structure.
|
|
1911
2238
|
*/
|
|
1912
|
-
interface
|
|
1913
|
-
|
|
1914
|
-
readonly
|
|
1915
|
-
|
|
1916
|
-
readonly
|
|
1917
|
-
|
|
2239
|
+
interface EvalAssertionInput {
|
|
2240
|
+
/** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
|
|
2241
|
+
readonly type: string;
|
|
2242
|
+
/** Display name */
|
|
2243
|
+
readonly name?: string;
|
|
2244
|
+
/** Value for deterministic assertions (contains, equals, regex) */
|
|
2245
|
+
readonly value?: string;
|
|
2246
|
+
/** Weight for scoring */
|
|
2247
|
+
readonly weight?: number;
|
|
2248
|
+
/** Whether this assertion is required to pass */
|
|
2249
|
+
readonly required?: boolean | number;
|
|
2250
|
+
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
2251
|
+
readonly min_score?: number;
|
|
2252
|
+
/** Prompt file for llm_grader */
|
|
2253
|
+
readonly prompt?: string;
|
|
2254
|
+
/** Script for code_grader */
|
|
2255
|
+
readonly script?: string | readonly string[];
|
|
2256
|
+
/** Additional config passed to the assertion */
|
|
2257
|
+
readonly config?: Record<string, unknown>;
|
|
2258
|
+
/** Nested assertions for composite type */
|
|
2259
|
+
readonly assert?: readonly EvalAssertionInput[];
|
|
2260
|
+
/** Rubric criteria for rubrics type */
|
|
2261
|
+
readonly criteria?: readonly (string | {
|
|
2262
|
+
id?: string;
|
|
2263
|
+
outcome: string;
|
|
2264
|
+
weight?: number;
|
|
2265
|
+
})[];
|
|
2266
|
+
/** Additional properties */
|
|
2267
|
+
readonly [key: string]: unknown;
|
|
1918
2268
|
}
|
|
2269
|
+
/** Assert entry: inline function or config object */
|
|
2270
|
+
type AssertEntry = AssertFn | EvalAssertionInput;
|
|
1919
2271
|
/**
|
|
1920
|
-
*
|
|
2272
|
+
* Configuration for `evaluate()`.
|
|
2273
|
+
* Accepts either inline tests or a spec file path.
|
|
1921
2274
|
*/
|
|
1922
|
-
interface
|
|
1923
|
-
|
|
1924
|
-
readonly
|
|
1925
|
-
|
|
1926
|
-
readonly
|
|
1927
|
-
|
|
1928
|
-
readonly
|
|
2275
|
+
interface EvalConfig {
|
|
2276
|
+
/** Inline test definitions (mutually exclusive with specFile) */
|
|
2277
|
+
readonly tests?: readonly EvalTestInput[];
|
|
2278
|
+
/** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
|
|
2279
|
+
readonly specFile?: string;
|
|
2280
|
+
/** Target provider configuration */
|
|
2281
|
+
readonly target?: TargetDefinition;
|
|
2282
|
+
/** Custom task function — mutually exclusive with target */
|
|
2283
|
+
readonly task?: (input: string) => string | Promise<string>;
|
|
2284
|
+
/** Suite-level assertions applied to all tests */
|
|
2285
|
+
readonly assert?: readonly AssertEntry[];
|
|
2286
|
+
/** Optional suite metadata used by CLI discovery, tagging, and reporting. */
|
|
2287
|
+
readonly metadata?: EvalMetadata;
|
|
2288
|
+
/** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
|
|
2289
|
+
readonly filter?: string | readonly string[];
|
|
2290
|
+
/** Maximum concurrent workers (default: 3) */
|
|
2291
|
+
readonly workers?: number;
|
|
2292
|
+
/** Maximum retries on failure (default: 2) */
|
|
2293
|
+
readonly maxRetries?: number;
|
|
2294
|
+
/** Agent timeout in milliseconds. No timeout if not set. */
|
|
2295
|
+
readonly agentTimeoutMs?: number;
|
|
2296
|
+
/** Enable response caching */
|
|
2297
|
+
readonly cache?: boolean;
|
|
2298
|
+
/** Verbose logging */
|
|
2299
|
+
readonly verbose?: boolean;
|
|
2300
|
+
/** Callback for each completed result */
|
|
2301
|
+
readonly onResult?: (result: EvaluationResult) => void;
|
|
2302
|
+
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
|
|
2303
|
+
readonly threshold?: number;
|
|
2304
|
+
/** Command(s) to run once before the suite starts. Same semantics as YAML before_all. */
|
|
2305
|
+
readonly beforeAll?: string | readonly string[];
|
|
2306
|
+
/** Suite-level cost cap in USD. Stops dispatching new tests when exceeded. */
|
|
2307
|
+
readonly budgetUsd?: number;
|
|
1929
2308
|
}
|
|
1930
2309
|
/**
|
|
1931
|
-
*
|
|
2310
|
+
* Summary statistics for an evaluation run.
|
|
1932
2311
|
*/
|
|
1933
|
-
interface
|
|
1934
|
-
|
|
1935
|
-
readonly
|
|
1936
|
-
|
|
1937
|
-
readonly
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
readonly
|
|
1942
|
-
|
|
1943
|
-
readonly
|
|
1944
|
-
readonly cwd?: string;
|
|
1945
|
-
readonly timeoutMs?: number;
|
|
1946
|
-
readonly logDir?: string;
|
|
1947
|
-
readonly logFormat?: 'summary' | 'json';
|
|
1948
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1949
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
1950
|
-
readonly systemPrompt?: string;
|
|
1951
|
-
}
|
|
1952
|
-
interface CopilotCliResolvedConfig {
|
|
1953
|
-
readonly executable: string;
|
|
1954
|
-
readonly model?: string;
|
|
1955
|
-
readonly args?: readonly string[];
|
|
1956
|
-
readonly cwd?: string;
|
|
1957
|
-
readonly timeoutMs?: number;
|
|
1958
|
-
readonly logDir?: string;
|
|
1959
|
-
readonly logFormat?: 'summary' | 'json';
|
|
1960
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1961
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
1962
|
-
readonly systemPrompt?: string;
|
|
1963
|
-
}
|
|
1964
|
-
interface CopilotSdkResolvedConfig {
|
|
1965
|
-
readonly cliUrl?: string;
|
|
1966
|
-
readonly cliPath?: string;
|
|
1967
|
-
readonly githubToken?: string;
|
|
1968
|
-
readonly model?: string;
|
|
1969
|
-
readonly cwd?: string;
|
|
1970
|
-
readonly timeoutMs?: number;
|
|
1971
|
-
readonly logDir?: string;
|
|
1972
|
-
readonly logFormat?: 'summary' | 'json';
|
|
1973
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
1974
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
1975
|
-
readonly systemPrompt?: string;
|
|
1976
|
-
/** BYOK provider type: "azure", "openai", or "anthropic". */
|
|
1977
|
-
readonly byokType?: string;
|
|
1978
|
-
/** BYOK base URL for the provider endpoint. */
|
|
1979
|
-
readonly byokBaseUrl?: string;
|
|
1980
|
-
/** BYOK API key for authenticating with the provider. */
|
|
1981
|
-
readonly byokApiKey?: string;
|
|
1982
|
-
/** BYOK bearer token (takes precedence over apiKey when set). */
|
|
1983
|
-
readonly byokBearerToken?: string;
|
|
1984
|
-
/** BYOK Azure API version (e.g. "2024-10-21"). Only used when byokType is "azure". */
|
|
1985
|
-
readonly byokApiVersion?: string;
|
|
1986
|
-
/** BYOK wire API format: "completions" or "responses". */
|
|
1987
|
-
readonly byokWireApi?: string;
|
|
1988
|
-
}
|
|
1989
|
-
interface CopilotLogResolvedConfig {
|
|
1990
|
-
/** Explicit path to a session directory containing events.jsonl. */
|
|
1991
|
-
readonly sessionDir?: string;
|
|
1992
|
-
/** Session UUID — combined with sessionStateDir to build the path. */
|
|
1993
|
-
readonly sessionId?: string;
|
|
1994
|
-
/** Auto-discovery mode. 'latest' picks the most recent session. */
|
|
1995
|
-
readonly discover?: 'latest';
|
|
1996
|
-
/** Override the default ~/.copilot/session-state directory. */
|
|
1997
|
-
readonly sessionStateDir?: string;
|
|
1998
|
-
/** Filter discovery by working directory. */
|
|
1999
|
-
readonly cwd?: string;
|
|
2000
|
-
}
|
|
2001
|
-
interface PiCodingAgentResolvedConfig {
|
|
2002
|
-
readonly subprovider?: string;
|
|
2003
|
-
readonly model?: string;
|
|
2004
|
-
readonly apiKey?: string;
|
|
2005
|
-
readonly baseUrl?: string;
|
|
2006
|
-
readonly tools?: string;
|
|
2007
|
-
readonly thinking?: string;
|
|
2008
|
-
readonly cwd?: string;
|
|
2009
|
-
readonly timeoutMs?: number;
|
|
2010
|
-
readonly logDir?: string;
|
|
2011
|
-
readonly logFormat?: 'summary' | 'json';
|
|
2012
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
2013
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
2014
|
-
readonly systemPrompt?: string;
|
|
2015
|
-
}
|
|
2016
|
-
interface PiCliResolvedConfig {
|
|
2017
|
-
readonly executable: string;
|
|
2018
|
-
readonly subprovider?: string;
|
|
2019
|
-
readonly model?: string;
|
|
2020
|
-
readonly apiKey?: string;
|
|
2021
|
-
readonly baseUrl?: string;
|
|
2022
|
-
readonly tools?: string;
|
|
2023
|
-
readonly thinking?: string;
|
|
2024
|
-
readonly args?: readonly string[];
|
|
2025
|
-
readonly cwd?: string;
|
|
2026
|
-
readonly timeoutMs?: number;
|
|
2027
|
-
readonly logDir?: string;
|
|
2028
|
-
readonly logFormat?: 'summary' | 'json';
|
|
2029
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
2030
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
2031
|
-
readonly systemPrompt?: string;
|
|
2312
|
+
interface EvalSummary {
|
|
2313
|
+
/** Total number of test cases */
|
|
2314
|
+
readonly total: number;
|
|
2315
|
+
/** Number of passing test cases (score >= threshold) */
|
|
2316
|
+
readonly passed: number;
|
|
2317
|
+
/** Number of failing test cases (score < threshold) */
|
|
2318
|
+
readonly failed: number;
|
|
2319
|
+
/** Total duration in milliseconds */
|
|
2320
|
+
readonly durationMs: number;
|
|
2321
|
+
/** Mean score across all cases */
|
|
2322
|
+
readonly meanScore: number;
|
|
2032
2323
|
}
|
|
2033
|
-
|
|
2034
|
-
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
readonly
|
|
2039
|
-
|
|
2040
|
-
readonly
|
|
2041
|
-
readonly logDir?: string;
|
|
2042
|
-
readonly logFormat?: 'summary' | 'json';
|
|
2043
|
-
/** New stream_log field. false=no stream log (default), 'raw'=per-event, 'summary'=consolidated. */
|
|
2044
|
-
readonly streamLog?: false | 'raw' | 'summary';
|
|
2324
|
+
/**
|
|
2325
|
+
* Result of an `evaluate()` call.
|
|
2326
|
+
*/
|
|
2327
|
+
interface EvalRunResult {
|
|
2328
|
+
/** Individual test case results */
|
|
2329
|
+
readonly results: readonly EvaluationResult[];
|
|
2330
|
+
/** Aggregate summary statistics */
|
|
2331
|
+
readonly summary: EvalSummary;
|
|
2045
2332
|
}
|
|
2046
|
-
|
|
2047
|
-
|
|
2048
|
-
|
|
2049
|
-
|
|
2050
|
-
|
|
2333
|
+
/**
|
|
2334
|
+
* Run an evaluation suite against a target provider.
|
|
2335
|
+
*
|
|
2336
|
+
* Accepts either inline test definitions or a path to an EVAL.yaml spec file.
|
|
2337
|
+
* The config shape mirrors the YAML structure — users can translate between
|
|
2338
|
+
* file-based and programmatic usage 1:1.
|
|
2339
|
+
*
|
|
2340
|
+
* @param config - Evaluation configuration
|
|
2341
|
+
* @returns Typed evaluation results with summary statistics
|
|
2342
|
+
*
|
|
2343
|
+
* @example Inline tests with assertions
|
|
2344
|
+
* ```typescript
|
|
2345
|
+
* const { results, summary } = await evaluate({
|
|
2346
|
+
* tests: [
|
|
2347
|
+
* {
|
|
2348
|
+
* id: 'greeting',
|
|
2349
|
+
* input: 'Say hello',
|
|
2350
|
+
* assert: [{ type: 'contains', value: 'hello' }],
|
|
2351
|
+
* },
|
|
2352
|
+
* ],
|
|
2353
|
+
* target: { provider: 'mock_agent' },
|
|
2354
|
+
* });
|
|
2355
|
+
* console.log(`${summary.passed}/${summary.total} passed`);
|
|
2356
|
+
* ```
|
|
2357
|
+
*
|
|
2358
|
+
* @example Load from YAML
|
|
2359
|
+
* ```typescript
|
|
2360
|
+
* const { summary } = await evaluate({
|
|
2361
|
+
* specFile: './evals/my-eval.yaml',
|
|
2362
|
+
* filter: 'greeting-*',
|
|
2363
|
+
* });
|
|
2364
|
+
* ```
|
|
2365
|
+
*/
|
|
2366
|
+
declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
|
|
2367
|
+
|
|
2368
|
+
interface TsEvalResult {
|
|
2369
|
+
readonly config: EvalConfig;
|
|
2370
|
+
readonly filePath: string;
|
|
2051
2371
|
}
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2372
|
+
/**
|
|
2373
|
+
* Import a *.eval.ts file and extract the EvalConfig export.
|
|
2374
|
+
* Tries default, `config`, and `evalConfig` named exports in priority order.
|
|
2375
|
+
*/
|
|
2376
|
+
declare function loadTsEvalFile(filePath: string): Promise<TsEvalResult>;
|
|
2377
|
+
|
|
2378
|
+
/**
|
|
2379
|
+
* EVAL.yaml → evals.json transpiler.
|
|
2380
|
+
*
|
|
2381
|
+
* Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
|
|
2382
|
+
* for consumption by the skill-creator pipeline.
|
|
2383
|
+
*
|
|
2384
|
+
* Handles both `assertions:` (current) and `assert:` (deprecated alias).
|
|
2385
|
+
*/
|
|
2386
|
+
interface EvalsJsonCase {
|
|
2387
|
+
id: number;
|
|
2388
|
+
prompt: string;
|
|
2389
|
+
expected_output?: string;
|
|
2390
|
+
files?: string[];
|
|
2391
|
+
should_trigger?: boolean;
|
|
2392
|
+
assertions: string[];
|
|
2058
2393
|
}
|
|
2059
|
-
interface
|
|
2060
|
-
|
|
2061
|
-
|
|
2394
|
+
interface EvalsJsonFile {
|
|
2395
|
+
skill_name: string;
|
|
2396
|
+
evals: EvalsJsonCase[];
|
|
2062
2397
|
}
|
|
2063
|
-
/**
|
|
2064
|
-
|
|
2065
|
-
|
|
2066
|
-
|
|
2067
|
-
|
|
2068
|
-
|
|
2069
|
-
|
|
2070
|
-
|
|
2071
|
-
|
|
2072
|
-
* to force CLI invocation even in subagent mode.
|
|
2073
|
-
*/
|
|
2074
|
-
readonly subagentModeAllowed?: boolean;
|
|
2075
|
-
/**
|
|
2076
|
-
* Ordered list of target names to try when the primary target fails after
|
|
2077
|
-
* exhausting retries. Each fallback is attempted in order.
|
|
2078
|
-
*/
|
|
2079
|
-
readonly fallbackTargets?: readonly string[];
|
|
2398
|
+
/**
|
|
2399
|
+
* Result of transpiling a single EVAL.yaml.
|
|
2400
|
+
* May produce multiple evals.json files (one per skill).
|
|
2401
|
+
*/
|
|
2402
|
+
interface TranspileResult {
|
|
2403
|
+
/** Map from skill_name → EvalsJsonFile */
|
|
2404
|
+
files: Map<string, EvalsJsonFile>;
|
|
2405
|
+
/** Warning messages accumulated during transpilation */
|
|
2406
|
+
warnings: string[];
|
|
2080
2407
|
}
|
|
2081
|
-
type ResolvedTarget = (ResolvedTargetBase & {
|
|
2082
|
-
readonly kind: 'openai';
|
|
2083
|
-
readonly config: OpenAIResolvedConfig;
|
|
2084
|
-
}) | (ResolvedTargetBase & {
|
|
2085
|
-
readonly kind: 'openrouter';
|
|
2086
|
-
readonly config: OpenRouterResolvedConfig;
|
|
2087
|
-
}) | (ResolvedTargetBase & {
|
|
2088
|
-
readonly kind: 'azure';
|
|
2089
|
-
readonly config: AzureResolvedConfig;
|
|
2090
|
-
}) | (ResolvedTargetBase & {
|
|
2091
|
-
readonly kind: 'anthropic';
|
|
2092
|
-
readonly config: AnthropicResolvedConfig;
|
|
2093
|
-
}) | (ResolvedTargetBase & {
|
|
2094
|
-
readonly kind: 'gemini';
|
|
2095
|
-
readonly config: GeminiResolvedConfig;
|
|
2096
|
-
}) | (ResolvedTargetBase & {
|
|
2097
|
-
readonly kind: 'codex';
|
|
2098
|
-
readonly config: CodexResolvedConfig;
|
|
2099
|
-
}) | (ResolvedTargetBase & {
|
|
2100
|
-
readonly kind: 'copilot-sdk';
|
|
2101
|
-
readonly config: CopilotSdkResolvedConfig;
|
|
2102
|
-
}) | (ResolvedTargetBase & {
|
|
2103
|
-
readonly kind: 'copilot-cli';
|
|
2104
|
-
readonly config: CopilotCliResolvedConfig;
|
|
2105
|
-
}) | (ResolvedTargetBase & {
|
|
2106
|
-
readonly kind: 'copilot-log';
|
|
2107
|
-
readonly config: CopilotLogResolvedConfig;
|
|
2108
|
-
}) | (ResolvedTargetBase & {
|
|
2109
|
-
readonly kind: 'pi-coding-agent';
|
|
2110
|
-
readonly config: PiCodingAgentResolvedConfig;
|
|
2111
|
-
}) | (ResolvedTargetBase & {
|
|
2112
|
-
readonly kind: 'pi-cli';
|
|
2113
|
-
readonly config: PiCliResolvedConfig;
|
|
2114
|
-
}) | (ResolvedTargetBase & {
|
|
2115
|
-
readonly kind: 'claude';
|
|
2116
|
-
readonly config: ClaudeResolvedConfig;
|
|
2117
|
-
}) | (ResolvedTargetBase & {
|
|
2118
|
-
readonly kind: 'claude-cli';
|
|
2119
|
-
readonly config: ClaudeResolvedConfig;
|
|
2120
|
-
}) | (ResolvedTargetBase & {
|
|
2121
|
-
readonly kind: 'claude-sdk';
|
|
2122
|
-
readonly config: ClaudeResolvedConfig;
|
|
2123
|
-
}) | (ResolvedTargetBase & {
|
|
2124
|
-
readonly kind: 'mock';
|
|
2125
|
-
readonly config: MockResolvedConfig;
|
|
2126
|
-
}) | (ResolvedTargetBase & {
|
|
2127
|
-
readonly kind: 'vscode' | 'vscode-insiders';
|
|
2128
|
-
readonly config: VSCodeResolvedConfig;
|
|
2129
|
-
}) | (ResolvedTargetBase & {
|
|
2130
|
-
readonly kind: 'agentv';
|
|
2131
|
-
readonly config: AgentVResolvedConfig;
|
|
2132
|
-
}) | (ResolvedTargetBase & {
|
|
2133
|
-
readonly kind: 'cli';
|
|
2134
|
-
readonly config: CliResolvedConfig;
|
|
2135
|
-
}) | (ResolvedTargetBase & {
|
|
2136
|
-
readonly kind: 'transcript';
|
|
2137
|
-
readonly config: Record<string, never>;
|
|
2138
|
-
});
|
|
2139
2408
|
/**
|
|
2140
|
-
*
|
|
2141
|
-
*
|
|
2142
|
-
*
|
|
2409
|
+
* Transpile a parsed EVAL.yaml object into one or more evals.json objects.
|
|
2410
|
+
*
|
|
2411
|
+
* @param suite Parsed YAML object (already loaded, no file I/O here)
|
|
2412
|
+
* @param source Source identifier for error messages (e.g. file path)
|
|
2143
2413
|
*/
|
|
2144
|
-
declare
|
|
2145
|
-
declare function resolveDelegatedTargetDefinition(name: string, definitions: ReadonlyMap<string, TargetDefinition>, env?: EnvLookup): TargetDefinition | undefined;
|
|
2146
|
-
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup, evalFilePath?: string, options?: {
|
|
2147
|
-
readonly emitDeprecationWarnings?: boolean;
|
|
2148
|
-
}): ResolvedTarget;
|
|
2149
|
-
|
|
2414
|
+
declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
|
|
2150
2415
|
/**
|
|
2151
|
-
*
|
|
2416
|
+
* Transpile an EVAL.yaml file into one or more evals.json objects.
|
|
2417
|
+
* Returns a map from output filename → JSON content.
|
|
2152
2418
|
*
|
|
2153
|
-
*
|
|
2154
|
-
|
|
2155
|
-
|
|
2156
|
-
|
|
2419
|
+
* @param evalYamlPath Absolute path to the EVAL.yaml file
|
|
2420
|
+
*/
|
|
2421
|
+
declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
|
|
2422
|
+
/**
|
|
2423
|
+
* Determine the output filename(s) for a transpile result.
|
|
2424
|
+
* Single skill → "evals.json"
|
|
2425
|
+
* Multiple skills → "<skill>.evals.json"
|
|
2157
2426
|
*/
|
|
2427
|
+
declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
|
|
2158
2428
|
|
|
2429
|
+
declare function fileExists(filePath: string): Promise<boolean>;
|
|
2159
2430
|
/**
|
|
2160
|
-
*
|
|
2431
|
+
* Normalize line endings to LF (\n).
|
|
2432
|
+
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
2161
2433
|
*/
|
|
2162
|
-
|
|
2434
|
+
declare function normalizeLineEndings(content: string): string;
|
|
2163
2435
|
/**
|
|
2164
|
-
*
|
|
2165
|
-
*
|
|
2166
|
-
* Built-in providers are registered at startup. Custom providers can be
|
|
2167
|
-
* registered via the `register()` method.
|
|
2436
|
+
* Read a text file and normalize line endings to LF (\n).
|
|
2437
|
+
* This ensures consistent behavior across Windows (CRLF) and Unix (LF) systems.
|
|
2168
2438
|
*/
|
|
2169
|
-
declare
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2439
|
+
declare function readTextFile(filePath: string): Promise<string>;
|
|
2440
|
+
/**
|
|
2441
|
+
* Read a JSON file and parse it.
|
|
2442
|
+
*/
|
|
2443
|
+
declare function readJsonFile<T = unknown>(filePath: string): Promise<T>;
|
|
2444
|
+
/**
|
|
2445
|
+
* Find git repository root by walking up the directory tree.
|
|
2446
|
+
*/
|
|
2447
|
+
declare function findGitRoot(startPath: string): Promise<string | null>;
|
|
2448
|
+
/**
|
|
2449
|
+
* Build a chain of directories walking from a file's location up to repo root.
|
|
2450
|
+
* Used for discovering configuration files like targets.yaml or config.yaml.
|
|
2451
|
+
*/
|
|
2452
|
+
declare function buildDirectoryChain(filePath: string, repoRoot: string): readonly string[];
|
|
2453
|
+
/**
|
|
2454
|
+
* Build search roots for file resolution, matching yaml-parser behavior.
|
|
2455
|
+
* Searches from eval file directory up to repo root.
|
|
2456
|
+
*/
|
|
2457
|
+
declare function buildSearchRoots(evalPath: string, repoRoot: string): readonly string[];
|
|
2458
|
+
/**
|
|
2459
|
+
* Resolve a file reference using search roots, matching yaml-parser behavior.
|
|
2460
|
+
*/
|
|
2461
|
+
declare function resolveFileReference(rawValue: string, searchRoots: readonly string[]): Promise<{
|
|
2462
|
+
readonly displayPath: string;
|
|
2463
|
+
readonly resolvedPath?: string;
|
|
2464
|
+
readonly attempted: readonly string[];
|
|
2465
|
+
}>;
|
|
2185
2466
|
|
|
2186
2467
|
declare function readTargetDefinitions(filePath: string): Promise<readonly TargetDefinition[]>;
|
|
2187
2468
|
declare function listTargetNames(definitions: readonly TargetDefinition[]): readonly string[];
|
|
@@ -2631,26 +2912,26 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2631
2912
|
reasoning: z.ZodString;
|
|
2632
2913
|
}, "strip", z.ZodTypeAny, {
|
|
2633
2914
|
id: string;
|
|
2634
|
-
reasoning: string;
|
|
2635
2915
|
satisfied: boolean;
|
|
2916
|
+
reasoning: string;
|
|
2636
2917
|
}, {
|
|
2637
2918
|
id: string;
|
|
2638
|
-
reasoning: string;
|
|
2639
2919
|
satisfied: boolean;
|
|
2920
|
+
reasoning: string;
|
|
2640
2921
|
}>, "many">;
|
|
2641
2922
|
overall_reasoning: z.ZodString;
|
|
2642
2923
|
}, "strip", z.ZodTypeAny, {
|
|
2643
2924
|
checks: {
|
|
2644
2925
|
id: string;
|
|
2645
|
-
reasoning: string;
|
|
2646
2926
|
satisfied: boolean;
|
|
2927
|
+
reasoning: string;
|
|
2647
2928
|
}[];
|
|
2648
2929
|
overall_reasoning: string;
|
|
2649
2930
|
}, {
|
|
2650
2931
|
checks: {
|
|
2651
2932
|
id: string;
|
|
2652
|
-
reasoning: string;
|
|
2653
2933
|
satisfied: boolean;
|
|
2934
|
+
reasoning: string;
|
|
2654
2935
|
}[];
|
|
2655
2936
|
overall_reasoning: string;
|
|
2656
2937
|
}>;
|
|
@@ -3105,244 +3386,6 @@ interface RunEvaluationOptions {
|
|
|
3105
3386
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
3106
3387
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
3107
3388
|
|
|
3108
|
-
/**
|
|
3109
|
-
* Types for inline assertion functions used in the evaluate() API.
|
|
3110
|
-
*
|
|
3111
|
-
* Inline functions are the escape hatch for custom evaluation logic
|
|
3112
|
-
* that doesn't fit a built-in grader type. For built-in assertions
|
|
3113
|
-
* (contains, regex, is-json, etc.), use config objects instead:
|
|
3114
|
-
*
|
|
3115
|
-
* assert: [{ type: 'contains', value: 'hello' }]
|
|
3116
|
-
*
|
|
3117
|
-
* Inline functions are for custom logic:
|
|
3118
|
-
*
|
|
3119
|
-
* assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
|
|
3120
|
-
*/
|
|
3121
|
-
/** Context passed to inline assertion functions */
|
|
3122
|
-
interface AssertContext {
|
|
3123
|
-
readonly input: string;
|
|
3124
|
-
readonly output: string;
|
|
3125
|
-
readonly expectedOutput?: string;
|
|
3126
|
-
readonly criteria?: string;
|
|
3127
|
-
readonly metadata?: Record<string, unknown>;
|
|
3128
|
-
}
|
|
3129
|
-
/** Result from an inline assertion function */
|
|
3130
|
-
interface AssertResult {
|
|
3131
|
-
readonly name: string;
|
|
3132
|
-
readonly score: number;
|
|
3133
|
-
readonly metadata?: Record<string, unknown>;
|
|
3134
|
-
}
|
|
3135
|
-
/** Inline assertion function signature */
|
|
3136
|
-
type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
|
|
3137
|
-
|
|
3138
|
-
/**
|
|
3139
|
-
* Programmatic API for running evaluations.
|
|
3140
|
-
*
|
|
3141
|
-
* Provides `evaluate()` — a high-level function for using AgentV as a library
|
|
3142
|
-
* instead of a CLI. The config shape mirrors the YAML structure for easy
|
|
3143
|
-
* translation between file-based and programmatic usage.
|
|
3144
|
-
*
|
|
3145
|
-
* @example Inline tests with config objects
|
|
3146
|
-
* ```typescript
|
|
3147
|
-
* import { evaluate } from '@agentv/core';
|
|
3148
|
-
*
|
|
3149
|
-
* const results = await evaluate({
|
|
3150
|
-
* tests: [
|
|
3151
|
-
* {
|
|
3152
|
-
* id: 'capital',
|
|
3153
|
-
* input: 'What is the capital of France?',
|
|
3154
|
-
* expectedOutput: 'Paris',
|
|
3155
|
-
* assert: [{ type: 'contains', value: 'Paris' }],
|
|
3156
|
-
* },
|
|
3157
|
-
* ],
|
|
3158
|
-
* target: { provider: 'mock_agent' },
|
|
3159
|
-
* });
|
|
3160
|
-
*
|
|
3161
|
-
* console.log(results.summary.passed, 'passed');
|
|
3162
|
-
* ```
|
|
3163
|
-
*
|
|
3164
|
-
* @example Inline tests with task function and custom assertion
|
|
3165
|
-
* ```typescript
|
|
3166
|
-
* import { evaluate } from '@agentv/core';
|
|
3167
|
-
*
|
|
3168
|
-
* const { summary } = await evaluate({
|
|
3169
|
-
* tests: [
|
|
3170
|
-
* {
|
|
3171
|
-
* id: 'echo',
|
|
3172
|
-
* input: 'hello',
|
|
3173
|
-
* expectedOutput: 'Echo: hello',
|
|
3174
|
-
* assert: [
|
|
3175
|
-
* { type: 'contains', value: 'hello' },
|
|
3176
|
-
* { type: 'equals' },
|
|
3177
|
-
* ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
|
|
3178
|
-
* ],
|
|
3179
|
-
* },
|
|
3180
|
-
* ],
|
|
3181
|
-
* task: async (input) => `Echo: ${input}`,
|
|
3182
|
-
* });
|
|
3183
|
-
* ```
|
|
3184
|
-
*
|
|
3185
|
-
* @example File-based
|
|
3186
|
-
* ```typescript
|
|
3187
|
-
* const results = await evaluate({
|
|
3188
|
-
* specFile: './evals/EVAL.yaml',
|
|
3189
|
-
* target: { provider: 'claude_agent' },
|
|
3190
|
-
* });
|
|
3191
|
-
* ```
|
|
3192
|
-
*
|
|
3193
|
-
* @module
|
|
3194
|
-
*/
|
|
3195
|
-
|
|
3196
|
-
/**
|
|
3197
|
-
* Inline test definition for the programmatic API.
|
|
3198
|
-
* Mirrors the YAML test structure.
|
|
3199
|
-
*/
|
|
3200
|
-
interface EvalTestInput {
|
|
3201
|
-
/** Unique test identifier */
|
|
3202
|
-
readonly id: string;
|
|
3203
|
-
/** What the response should accomplish */
|
|
3204
|
-
readonly criteria?: string;
|
|
3205
|
-
/** Input to the agent (string or message array) */
|
|
3206
|
-
readonly input: string | readonly {
|
|
3207
|
-
role: string;
|
|
3208
|
-
content: string;
|
|
3209
|
-
}[];
|
|
3210
|
-
/** Expected reference output (camelCase preferred) */
|
|
3211
|
-
readonly expectedOutput?: string;
|
|
3212
|
-
/** @deprecated Use `expectedOutput` instead */
|
|
3213
|
-
readonly expected_output?: string;
|
|
3214
|
-
/** Assertion graders — accepts factory functions, config objects, or inline functions */
|
|
3215
|
-
readonly assert?: readonly AssertEntry[];
|
|
3216
|
-
/** Arbitrary metadata */
|
|
3217
|
-
readonly metadata?: Record<string, unknown>;
|
|
3218
|
-
}
|
|
3219
|
-
/**
|
|
3220
|
-
* Inline assertion definition for the programmatic API.
|
|
3221
|
-
* Matches the YAML `assert` block structure.
|
|
3222
|
-
*/
|
|
3223
|
-
interface EvalAssertionInput {
|
|
3224
|
-
/** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
|
|
3225
|
-
readonly type: string;
|
|
3226
|
-
/** Display name */
|
|
3227
|
-
readonly name?: string;
|
|
3228
|
-
/** Value for deterministic assertions (contains, equals, regex) */
|
|
3229
|
-
readonly value?: string;
|
|
3230
|
-
/** Weight for scoring */
|
|
3231
|
-
readonly weight?: number;
|
|
3232
|
-
/** Whether this assertion is required to pass */
|
|
3233
|
-
readonly required?: boolean | number;
|
|
3234
|
-
/** Minimum score (0-1) for this evaluator to pass. Independent of `required` gate. */
|
|
3235
|
-
readonly min_score?: number;
|
|
3236
|
-
/** Prompt file for llm_grader */
|
|
3237
|
-
readonly prompt?: string;
|
|
3238
|
-
/** Script for code_grader */
|
|
3239
|
-
readonly script?: string | readonly string[];
|
|
3240
|
-
/** Additional config passed to the assertion */
|
|
3241
|
-
readonly config?: Record<string, unknown>;
|
|
3242
|
-
/** Nested assertions for composite type */
|
|
3243
|
-
readonly assert?: readonly EvalAssertionInput[];
|
|
3244
|
-
/** Rubric criteria for rubrics type */
|
|
3245
|
-
readonly criteria?: readonly (string | {
|
|
3246
|
-
id?: string;
|
|
3247
|
-
outcome: string;
|
|
3248
|
-
weight?: number;
|
|
3249
|
-
})[];
|
|
3250
|
-
/** Additional properties */
|
|
3251
|
-
readonly [key: string]: unknown;
|
|
3252
|
-
}
|
|
3253
|
-
/** Assert entry: inline function or config object */
|
|
3254
|
-
type AssertEntry = AssertFn | EvalAssertionInput;
|
|
3255
|
-
/**
|
|
3256
|
-
* Configuration for `evaluate()`.
|
|
3257
|
-
* Accepts either inline tests or a spec file path.
|
|
3258
|
-
*/
|
|
3259
|
-
interface EvalConfig {
|
|
3260
|
-
/** Inline test definitions (mutually exclusive with specFile) */
|
|
3261
|
-
readonly tests?: readonly EvalTestInput[];
|
|
3262
|
-
/** Path to an EVAL.yaml spec file (mutually exclusive with tests) */
|
|
3263
|
-
readonly specFile?: string;
|
|
3264
|
-
/** Target provider configuration */
|
|
3265
|
-
readonly target?: TargetDefinition;
|
|
3266
|
-
/** Custom task function — mutually exclusive with target */
|
|
3267
|
-
readonly task?: (input: string) => string | Promise<string>;
|
|
3268
|
-
/** Suite-level assertions applied to all tests */
|
|
3269
|
-
readonly assert?: readonly AssertEntry[];
|
|
3270
|
-
/** Filter tests by ID pattern(s) (glob supported). Arrays use OR logic. */
|
|
3271
|
-
readonly filter?: string | readonly string[];
|
|
3272
|
-
/** Maximum concurrent workers (default: 3) */
|
|
3273
|
-
readonly workers?: number;
|
|
3274
|
-
/** Maximum retries on failure (default: 2) */
|
|
3275
|
-
readonly maxRetries?: number;
|
|
3276
|
-
/** Agent timeout in milliseconds. No timeout if not set. */
|
|
3277
|
-
readonly agentTimeoutMs?: number;
|
|
3278
|
-
/** Enable response caching */
|
|
3279
|
-
readonly cache?: boolean;
|
|
3280
|
-
/** Verbose logging */
|
|
3281
|
-
readonly verbose?: boolean;
|
|
3282
|
-
/** Callback for each completed result */
|
|
3283
|
-
readonly onResult?: (result: EvaluationResult) => void;
|
|
3284
|
-
/** Score threshold for pass/fail (0-1). Default: 0.8 (DEFAULT_THRESHOLD). */
|
|
3285
|
-
readonly threshold?: number;
|
|
3286
|
-
}
|
|
3287
|
-
/**
|
|
3288
|
-
* Summary statistics for an evaluation run.
|
|
3289
|
-
*/
|
|
3290
|
-
interface EvalSummary {
|
|
3291
|
-
/** Total number of test cases */
|
|
3292
|
-
readonly total: number;
|
|
3293
|
-
/** Number of passing test cases (score >= threshold) */
|
|
3294
|
-
readonly passed: number;
|
|
3295
|
-
/** Number of failing test cases (score < threshold) */
|
|
3296
|
-
readonly failed: number;
|
|
3297
|
-
/** Total duration in milliseconds */
|
|
3298
|
-
readonly durationMs: number;
|
|
3299
|
-
/** Mean score across all cases */
|
|
3300
|
-
readonly meanScore: number;
|
|
3301
|
-
}
|
|
3302
|
-
/**
|
|
3303
|
-
* Result of an `evaluate()` call.
|
|
3304
|
-
*/
|
|
3305
|
-
interface EvalRunResult {
|
|
3306
|
-
/** Individual test case results */
|
|
3307
|
-
readonly results: readonly EvaluationResult[];
|
|
3308
|
-
/** Aggregate summary statistics */
|
|
3309
|
-
readonly summary: EvalSummary;
|
|
3310
|
-
}
|
|
3311
|
-
/**
|
|
3312
|
-
* Run an evaluation suite against a target provider.
|
|
3313
|
-
*
|
|
3314
|
-
* Accepts either inline test definitions or a path to an EVAL.yaml spec file.
|
|
3315
|
-
* The config shape mirrors the YAML structure — users can translate between
|
|
3316
|
-
* file-based and programmatic usage 1:1.
|
|
3317
|
-
*
|
|
3318
|
-
* @param config - Evaluation configuration
|
|
3319
|
-
* @returns Typed evaluation results with summary statistics
|
|
3320
|
-
*
|
|
3321
|
-
* @example Inline tests with assertions
|
|
3322
|
-
* ```typescript
|
|
3323
|
-
* const { results, summary } = await evaluate({
|
|
3324
|
-
* tests: [
|
|
3325
|
-
* {
|
|
3326
|
-
* id: 'greeting',
|
|
3327
|
-
* input: 'Say hello',
|
|
3328
|
-
* assert: [{ type: 'contains', value: 'hello' }],
|
|
3329
|
-
* },
|
|
3330
|
-
* ],
|
|
3331
|
-
* target: { provider: 'mock_agent' },
|
|
3332
|
-
* });
|
|
3333
|
-
* console.log(`${summary.passed}/${summary.total} passed`);
|
|
3334
|
-
* ```
|
|
3335
|
-
*
|
|
3336
|
-
* @example Load from YAML
|
|
3337
|
-
* ```typescript
|
|
3338
|
-
* const { summary } = await evaluate({
|
|
3339
|
-
* specFile: './evals/my-eval.yaml',
|
|
3340
|
-
* filter: 'greeting-*',
|
|
3341
|
-
* });
|
|
3342
|
-
* ```
|
|
3343
|
-
*/
|
|
3344
|
-
declare function evaluate(config: EvalConfig): Promise<EvalRunResult>;
|
|
3345
|
-
|
|
3346
3389
|
/**
|
|
3347
3390
|
* Typed configuration file support for AgentV.
|
|
3348
3391
|
*
|
|
@@ -4553,4 +4596,4 @@ type AgentKernel = {
|
|
|
4553
4596
|
};
|
|
4554
4597
|
declare function createAgentKernel(): AgentKernel;
|
|
4555
4598
|
|
|
4556
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4599
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildGraderResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeGrader, type CodeGraderConfig, type CodeGraderOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeGrader, type CompositeGraderConfig, type CompositeGraderOptions, type ConfidenceIntervalAggregation, type ContainsAllGraderConfig, type ContainsAnyGraderConfig, type ContainsGraderConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type ConversationTurnInput, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostGrader, type CostGraderConfig, type CostGraderOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_GRADER_TEMPLATE, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionGrader, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithGraderConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsGraderConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsGrader, type ExecutionMetricsGraderConfig, type ExecutionMetricsGraderOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyGrader, type FieldAccuracyGraderConfig, type FieldAccuracyGraderOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type Grader, type GraderConfig, type GraderDispatchContext, type GraderFactory, type GraderFactoryFn, type GraderKind, GraderRegistry, type GraderResult, type IcontainsAllGraderConfig, type IcontainsAnyGraderConfig, type IcontainsGraderConfig, type InlineAssertEvaluatorConfig, type IsJsonGraderConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyGrader, type LatencyGraderConfig, type LatencyGraderOptions, LlmGrader, type LlmGraderConfig, type LlmGraderOptions, type LlmGraderPromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexGraderConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerGrader, type SkillTriggerGraderConfig, type StartsWithGraderConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageGrader, type TokenUsageGraderConfig, type TokenUsageGraderOptions, type ToolCall, type ToolTestMessage, type ToolTrajectoryExpectedItem, ToolTrajectoryGrader, type ToolTrajectoryGraderConfig, type ToolTrajectoryGraderOptions, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TsEvalResult, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isGraderKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, loadTsEvalFile, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|