@agentv/core 4.15.9-next.1 → 4.16.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-HVEQNYTC.js → chunk-6VZY3B6M.js} +55 -165
- package/dist/chunk-6VZY3B6M.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +18 -17
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +13 -12
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +329 -257
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +71 -25
- package/dist/index.d.ts +71 -25
- package/dist/index.js +249 -59
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-HVEQNYTC.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -96,7 +96,7 @@ interface ProviderRequest {
|
|
|
96
96
|
readonly temperature?: number;
|
|
97
97
|
readonly metadata?: JsonObject;
|
|
98
98
|
readonly signal?: AbortSignal;
|
|
99
|
-
/** Working directory override (e.g., from
|
|
99
|
+
/** Working directory override (e.g., from eval-level workspace.template) */
|
|
100
100
|
readonly cwd?: string;
|
|
101
101
|
/** VS Code .code-workspace file (resolved from workspace.template) */
|
|
102
102
|
readonly workspaceFile?: string;
|
|
@@ -265,7 +265,6 @@ interface TargetDefinition {
|
|
|
265
265
|
readonly wait?: boolean | unknown | undefined;
|
|
266
266
|
readonly dry_run?: boolean | unknown | undefined;
|
|
267
267
|
readonly subagent_root?: string | unknown | undefined;
|
|
268
|
-
readonly workspace_template?: string | unknown | undefined;
|
|
269
268
|
readonly files_format?: string | unknown | undefined;
|
|
270
269
|
readonly attachments_format?: string | unknown | undefined;
|
|
271
270
|
readonly env?: unknown | undefined;
|
|
@@ -630,6 +629,38 @@ type WorkspaceHooksConfig = {
|
|
|
630
629
|
/** Runs once after final test in the workspace lifecycle */
|
|
631
630
|
readonly after_all?: WorkspaceHookConfig;
|
|
632
631
|
};
|
|
632
|
+
/**
|
|
633
|
+
* Per-target hook configuration defined in eval files.
|
|
634
|
+
* Target hooks run setup/teardown scripts to customize the workspace for each target variant.
|
|
635
|
+
*
|
|
636
|
+
* Execution order relative to workspace hooks:
|
|
637
|
+
* - Setup: workspace before_all → target before_all → (per test: workspace before_each → target before_each)
|
|
638
|
+
* - Teardown: (per test: target after_each → workspace after_each) → target after_all → workspace after_all
|
|
639
|
+
*/
|
|
640
|
+
type TargetHooksConfig = {
|
|
641
|
+
/** Runs once before first test for this target */
|
|
642
|
+
readonly before_all?: WorkspaceHookConfig;
|
|
643
|
+
/** Runs before each test case for this target */
|
|
644
|
+
readonly before_each?: WorkspaceHookConfig;
|
|
645
|
+
/** Runs after each test case for this target */
|
|
646
|
+
readonly after_each?: WorkspaceHookConfig;
|
|
647
|
+
/** Runs once after final test for this target */
|
|
648
|
+
readonly after_all?: WorkspaceHookConfig;
|
|
649
|
+
};
|
|
650
|
+
/**
|
|
651
|
+
* Extended target reference from eval file.
|
|
652
|
+
* Allows eval files to define per-target hooks and delegation alongside target names.
|
|
653
|
+
*
|
|
654
|
+
* String targets are shorthand for `{ name: "target-name" }` (no hooks).
|
|
655
|
+
*/
|
|
656
|
+
type EvalTargetRef = {
|
|
657
|
+
/** Target name (must match a target in targets.yaml or be defined inline with use_target) */
|
|
658
|
+
readonly name: string;
|
|
659
|
+
/** Delegate to another named target (same as use_target in targets.yaml) */
|
|
660
|
+
readonly use_target?: string;
|
|
661
|
+
/** Per-target hooks for workspace customization */
|
|
662
|
+
readonly hooks?: TargetHooksConfig;
|
|
663
|
+
};
|
|
633
664
|
/**
|
|
634
665
|
* Docker-based workspace configuration.
|
|
635
666
|
* When present, code-grader commands run inside a Docker container
|
|
@@ -1377,7 +1408,7 @@ interface EvaluationResult {
|
|
|
1377
1408
|
readonly afterAllOutput?: string;
|
|
1378
1409
|
/** Captured output from workspace after_each script */
|
|
1379
1410
|
readonly afterEachOutput?: string;
|
|
1380
|
-
/** Unified diff of workspace file changes
|
|
1411
|
+
/** Unified diff of workspace file changes */
|
|
1381
1412
|
readonly fileChanges?: string;
|
|
1382
1413
|
/** Individual trial results (only present when trials.count > 1) */
|
|
1383
1414
|
readonly trials?: readonly TrialResult[];
|
|
@@ -1499,7 +1530,13 @@ declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<Age
|
|
|
1499
1530
|
*/
|
|
1500
1531
|
declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
|
|
1501
1532
|
/**
|
|
1502
|
-
* Extract
|
|
1533
|
+
* Extract target refs from parsed eval suite.
|
|
1534
|
+
* Supports both string shorthand and object form with hooks.
|
|
1535
|
+
* Returns undefined when no targets array is specified.
|
|
1536
|
+
*/
|
|
1537
|
+
declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
|
|
1538
|
+
/**
|
|
1539
|
+
* Extract target names from parsed eval suite (backward-compat wrapper).
|
|
1503
1540
|
* Precedence: execution.targets (array) > execution.target (singular).
|
|
1504
1541
|
* Returns undefined when no targets array is specified.
|
|
1505
1542
|
*/
|
|
@@ -1584,6 +1621,7 @@ type LoadOptions = {
|
|
|
1584
1621
|
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
1585
1622
|
target?: string;
|
|
1586
1623
|
targets?: readonly string[];
|
|
1624
|
+
targetRefs?: readonly EvalTargetRef[];
|
|
1587
1625
|
trials?: TrialsConfig;
|
|
1588
1626
|
}>;
|
|
1589
1627
|
/**
|
|
@@ -1595,6 +1633,8 @@ type EvalSuiteResult = {
|
|
|
1595
1633
|
readonly trials?: TrialsConfig;
|
|
1596
1634
|
/** Suite-level targets from execution.targets (matrix evaluation) */
|
|
1597
1635
|
readonly targets?: readonly string[];
|
|
1636
|
+
/** Suite-level target refs with hooks from execution.targets (object form) */
|
|
1637
|
+
readonly targetRefs?: readonly EvalTargetRef[];
|
|
1598
1638
|
/** Suite-level workers from execution.workers */
|
|
1599
1639
|
readonly workers?: number;
|
|
1600
1640
|
/** Suite-level cache config from execution.cache */
|
|
@@ -1765,7 +1805,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1765
1805
|
command: z.ZodString;
|
|
1766
1806
|
filesFormat: z.ZodOptional<z.ZodString>;
|
|
1767
1807
|
cwd: z.ZodOptional<z.ZodString>;
|
|
1768
|
-
workspaceTemplate: z.ZodOptional<z.ZodString>;
|
|
1769
1808
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1770
1809
|
healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
|
|
1771
1810
|
url: z.ZodString;
|
|
@@ -1782,46 +1821,44 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1782
1821
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1783
1822
|
}, "strict", z.ZodTypeAny, {
|
|
1784
1823
|
command: string;
|
|
1785
|
-
cwd?: string | undefined;
|
|
1786
1824
|
timeoutMs?: number | undefined;
|
|
1825
|
+
cwd?: string | undefined;
|
|
1787
1826
|
}, {
|
|
1788
1827
|
command: string;
|
|
1789
|
-
cwd?: string | undefined;
|
|
1790
1828
|
timeoutMs?: number | undefined;
|
|
1829
|
+
cwd?: string | undefined;
|
|
1791
1830
|
}>]>>;
|
|
1792
1831
|
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
1793
1832
|
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
1794
1833
|
}, "strict", z.ZodTypeAny, {
|
|
1795
1834
|
command: string;
|
|
1796
|
-
|
|
1835
|
+
timeoutMs?: number | undefined;
|
|
1797
1836
|
cwd?: string | undefined;
|
|
1837
|
+
verbose?: boolean | undefined;
|
|
1798
1838
|
healthcheck?: {
|
|
1799
1839
|
url: string;
|
|
1800
1840
|
timeoutMs?: number | undefined;
|
|
1801
1841
|
} | {
|
|
1802
1842
|
command: string;
|
|
1803
|
-
cwd?: string | undefined;
|
|
1804
1843
|
timeoutMs?: number | undefined;
|
|
1844
|
+
cwd?: string | undefined;
|
|
1805
1845
|
} | undefined;
|
|
1806
|
-
timeoutMs?: number | undefined;
|
|
1807
1846
|
filesFormat?: string | undefined;
|
|
1808
|
-
workspaceTemplate?: string | undefined;
|
|
1809
1847
|
keepTempFiles?: boolean | undefined;
|
|
1810
1848
|
}, {
|
|
1811
1849
|
command: string;
|
|
1812
|
-
|
|
1850
|
+
timeoutMs?: number | undefined;
|
|
1813
1851
|
cwd?: string | undefined;
|
|
1852
|
+
verbose?: boolean | undefined;
|
|
1814
1853
|
healthcheck?: {
|
|
1815
1854
|
url: string;
|
|
1816
1855
|
timeoutMs?: number | undefined;
|
|
1817
1856
|
} | {
|
|
1818
1857
|
command: string;
|
|
1819
|
-
cwd?: string | undefined;
|
|
1820
1858
|
timeoutMs?: number | undefined;
|
|
1859
|
+
cwd?: string | undefined;
|
|
1821
1860
|
} | undefined;
|
|
1822
|
-
timeoutMs?: number | undefined;
|
|
1823
1861
|
filesFormat?: string | undefined;
|
|
1824
|
-
workspaceTemplate?: string | undefined;
|
|
1825
1862
|
keepTempFiles?: boolean | undefined;
|
|
1826
1863
|
}>;
|
|
1827
1864
|
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
@@ -1907,7 +1944,6 @@ interface CodexResolvedConfig {
|
|
|
1907
1944
|
readonly executable: string;
|
|
1908
1945
|
readonly args?: readonly string[];
|
|
1909
1946
|
readonly cwd?: string;
|
|
1910
|
-
readonly workspaceTemplate?: string;
|
|
1911
1947
|
readonly timeoutMs?: number;
|
|
1912
1948
|
readonly logDir?: string;
|
|
1913
1949
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -1920,7 +1956,6 @@ interface CopilotCliResolvedConfig {
|
|
|
1920
1956
|
readonly model?: string;
|
|
1921
1957
|
readonly args?: readonly string[];
|
|
1922
1958
|
readonly cwd?: string;
|
|
1923
|
-
readonly workspaceTemplate?: string;
|
|
1924
1959
|
readonly timeoutMs?: number;
|
|
1925
1960
|
readonly logDir?: string;
|
|
1926
1961
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -1934,7 +1969,6 @@ interface CopilotSdkResolvedConfig {
|
|
|
1934
1969
|
readonly githubToken?: string;
|
|
1935
1970
|
readonly model?: string;
|
|
1936
1971
|
readonly cwd?: string;
|
|
1937
|
-
readonly workspaceTemplate?: string;
|
|
1938
1972
|
readonly timeoutMs?: number;
|
|
1939
1973
|
readonly logDir?: string;
|
|
1940
1974
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -1974,7 +2008,6 @@ interface PiCodingAgentResolvedConfig {
|
|
|
1974
2008
|
readonly tools?: string;
|
|
1975
2009
|
readonly thinking?: string;
|
|
1976
2010
|
readonly cwd?: string;
|
|
1977
|
-
readonly workspaceTemplate?: string;
|
|
1978
2011
|
readonly timeoutMs?: number;
|
|
1979
2012
|
readonly logDir?: string;
|
|
1980
2013
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -1992,7 +2025,6 @@ interface PiCliResolvedConfig {
|
|
|
1992
2025
|
readonly thinking?: string;
|
|
1993
2026
|
readonly args?: readonly string[];
|
|
1994
2027
|
readonly cwd?: string;
|
|
1995
|
-
readonly workspaceTemplate?: string;
|
|
1996
2028
|
readonly timeoutMs?: number;
|
|
1997
2029
|
readonly logDir?: string;
|
|
1998
2030
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -2001,10 +2033,10 @@ interface PiCliResolvedConfig {
|
|
|
2001
2033
|
readonly systemPrompt?: string;
|
|
2002
2034
|
}
|
|
2003
2035
|
interface ClaudeResolvedConfig {
|
|
2036
|
+
readonly executable: string;
|
|
2004
2037
|
readonly model?: string;
|
|
2005
2038
|
readonly systemPrompt?: string;
|
|
2006
2039
|
readonly cwd?: string;
|
|
2007
|
-
readonly workspaceTemplate?: string;
|
|
2008
2040
|
readonly timeoutMs?: number;
|
|
2009
2041
|
readonly maxTurns?: number;
|
|
2010
2042
|
readonly maxBudgetUsd?: number;
|
|
@@ -2024,7 +2056,6 @@ interface VSCodeResolvedConfig {
|
|
|
2024
2056
|
readonly waitForResponse: boolean;
|
|
2025
2057
|
readonly dryRun: boolean;
|
|
2026
2058
|
readonly subagentRoot?: string;
|
|
2027
|
-
readonly workspaceTemplate?: string;
|
|
2028
2059
|
readonly timeoutMs?: number;
|
|
2029
2060
|
}
|
|
2030
2061
|
interface AgentVResolvedConfig {
|
|
@@ -2335,9 +2366,9 @@ interface EvaluationContext {
|
|
|
2335
2366
|
readonly targetResolver?: TargetResolver;
|
|
2336
2367
|
/** List of available target names for code graders */
|
|
2337
2368
|
readonly availableTargets?: readonly string[];
|
|
2338
|
-
/** Unified diff of file changes from workspace
|
|
2369
|
+
/** Unified diff of file changes from workspace */
|
|
2339
2370
|
readonly fileChanges?: string;
|
|
2340
|
-
/** Absolute path to the workspace directory
|
|
2371
|
+
/** Absolute path to the workspace directory */
|
|
2341
2372
|
readonly workspacePath?: string;
|
|
2342
2373
|
/** Docker workspace config: when present, code-grader commands run inside a container */
|
|
2343
2374
|
readonly dockerConfig?: DockerWorkspaceConfig;
|
|
@@ -3001,6 +3032,8 @@ interface RunEvalCaseOptions {
|
|
|
3001
3032
|
readonly threshold?: number;
|
|
3002
3033
|
/** Results from dependency tests (only present when the test has depends_on) */
|
|
3003
3034
|
readonly dependencyResults?: Readonly<Record<string, DependencyResult>>;
|
|
3035
|
+
/** Per-target hooks from eval file (before_all, before_each, after_each, after_all) */
|
|
3036
|
+
readonly targetHooks?: TargetHooksConfig;
|
|
3004
3037
|
}
|
|
3005
3038
|
interface ProgressEvent {
|
|
3006
3039
|
readonly workerId: number;
|
|
@@ -3068,6 +3101,8 @@ interface RunEvaluationOptions {
|
|
|
3068
3101
|
readonly model?: string;
|
|
3069
3102
|
/** Per-test score threshold for pass/fail (default: 0.8) */
|
|
3070
3103
|
readonly threshold?: number;
|
|
3104
|
+
/** Per-target hooks from eval file (before_all, before_each, after_each, after_all) */
|
|
3105
|
+
readonly targetHooks?: TargetHooksConfig;
|
|
3071
3106
|
}
|
|
3072
3107
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
3073
3108
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -3951,6 +3986,17 @@ declare function createDraftResultsPr(params: {
|
|
|
3951
3986
|
readonly body: string;
|
|
3952
3987
|
}): Promise<string>;
|
|
3953
3988
|
|
|
3989
|
+
/**
|
|
3990
|
+
* The default config directory (~/.agentv). Always resolves to the user's home
|
|
3991
|
+
* directory regardless of AGENTV_HOME. Used for lightweight, machine-local files
|
|
3992
|
+
* like version-check.json, last-config.json, and projects.yaml.
|
|
3993
|
+
*/
|
|
3994
|
+
declare function getAgentvConfigDir(): string;
|
|
3995
|
+
/**
|
|
3996
|
+
* The data root for heavy/large artifacts (workspaces, workspace-pool, subagents,
|
|
3997
|
+
* trace-state, cache, deps). Respects AGENTV_HOME override so users can relocate
|
|
3998
|
+
* bulky data to a different drive. Falls back to ~/.agentv when unset.
|
|
3999
|
+
*/
|
|
3954
4000
|
declare function getAgentvHome(): string;
|
|
3955
4001
|
declare function getWorkspacesRoot(): string;
|
|
3956
4002
|
declare function getSubagentsRoot(): string;
|
|
@@ -4509,4 +4555,4 @@ type AgentKernel = {
|
|
|
4509
4555
|
};
|
|
4510
4556
|
declare function createAgentKernel(): AgentKernel;
|
|
4511
4557
|
|
|
4512
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4558
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -96,7 +96,7 @@ interface ProviderRequest {
|
|
|
96
96
|
readonly temperature?: number;
|
|
97
97
|
readonly metadata?: JsonObject;
|
|
98
98
|
readonly signal?: AbortSignal;
|
|
99
|
-
/** Working directory override (e.g., from
|
|
99
|
+
/** Working directory override (e.g., from eval-level workspace.template) */
|
|
100
100
|
readonly cwd?: string;
|
|
101
101
|
/** VS Code .code-workspace file (resolved from workspace.template) */
|
|
102
102
|
readonly workspaceFile?: string;
|
|
@@ -265,7 +265,6 @@ interface TargetDefinition {
|
|
|
265
265
|
readonly wait?: boolean | unknown | undefined;
|
|
266
266
|
readonly dry_run?: boolean | unknown | undefined;
|
|
267
267
|
readonly subagent_root?: string | unknown | undefined;
|
|
268
|
-
readonly workspace_template?: string | unknown | undefined;
|
|
269
268
|
readonly files_format?: string | unknown | undefined;
|
|
270
269
|
readonly attachments_format?: string | unknown | undefined;
|
|
271
270
|
readonly env?: unknown | undefined;
|
|
@@ -630,6 +629,38 @@ type WorkspaceHooksConfig = {
|
|
|
630
629
|
/** Runs once after final test in the workspace lifecycle */
|
|
631
630
|
readonly after_all?: WorkspaceHookConfig;
|
|
632
631
|
};
|
|
632
|
+
/**
|
|
633
|
+
* Per-target hook configuration defined in eval files.
|
|
634
|
+
* Target hooks run setup/teardown scripts to customize the workspace for each target variant.
|
|
635
|
+
*
|
|
636
|
+
* Execution order relative to workspace hooks:
|
|
637
|
+
* - Setup: workspace before_all → target before_all → (per test: workspace before_each → target before_each)
|
|
638
|
+
* - Teardown: (per test: target after_each → workspace after_each) → target after_all → workspace after_all
|
|
639
|
+
*/
|
|
640
|
+
type TargetHooksConfig = {
|
|
641
|
+
/** Runs once before first test for this target */
|
|
642
|
+
readonly before_all?: WorkspaceHookConfig;
|
|
643
|
+
/** Runs before each test case for this target */
|
|
644
|
+
readonly before_each?: WorkspaceHookConfig;
|
|
645
|
+
/** Runs after each test case for this target */
|
|
646
|
+
readonly after_each?: WorkspaceHookConfig;
|
|
647
|
+
/** Runs once after final test for this target */
|
|
648
|
+
readonly after_all?: WorkspaceHookConfig;
|
|
649
|
+
};
|
|
650
|
+
/**
|
|
651
|
+
* Extended target reference from eval file.
|
|
652
|
+
* Allows eval files to define per-target hooks and delegation alongside target names.
|
|
653
|
+
*
|
|
654
|
+
* String targets are shorthand for `{ name: "target-name" }` (no hooks).
|
|
655
|
+
*/
|
|
656
|
+
type EvalTargetRef = {
|
|
657
|
+
/** Target name (must match a target in targets.yaml or be defined inline with use_target) */
|
|
658
|
+
readonly name: string;
|
|
659
|
+
/** Delegate to another named target (same as use_target in targets.yaml) */
|
|
660
|
+
readonly use_target?: string;
|
|
661
|
+
/** Per-target hooks for workspace customization */
|
|
662
|
+
readonly hooks?: TargetHooksConfig;
|
|
663
|
+
};
|
|
633
664
|
/**
|
|
634
665
|
* Docker-based workspace configuration.
|
|
635
666
|
* When present, code-grader commands run inside a Docker container
|
|
@@ -1377,7 +1408,7 @@ interface EvaluationResult {
|
|
|
1377
1408
|
readonly afterAllOutput?: string;
|
|
1378
1409
|
/** Captured output from workspace after_each script */
|
|
1379
1410
|
readonly afterEachOutput?: string;
|
|
1380
|
-
/** Unified diff of workspace file changes
|
|
1411
|
+
/** Unified diff of workspace file changes */
|
|
1381
1412
|
readonly fileChanges?: string;
|
|
1382
1413
|
/** Individual trial results (only present when trials.count > 1) */
|
|
1383
1414
|
readonly trials?: readonly TrialResult[];
|
|
@@ -1499,7 +1530,13 @@ declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<Age
|
|
|
1499
1530
|
*/
|
|
1500
1531
|
declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
|
|
1501
1532
|
/**
|
|
1502
|
-
* Extract
|
|
1533
|
+
* Extract target refs from parsed eval suite.
|
|
1534
|
+
* Supports both string shorthand and object form with hooks.
|
|
1535
|
+
* Returns undefined when no targets array is specified.
|
|
1536
|
+
*/
|
|
1537
|
+
declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
|
|
1538
|
+
/**
|
|
1539
|
+
* Extract target names from parsed eval suite (backward-compat wrapper).
|
|
1503
1540
|
* Precedence: execution.targets (array) > execution.target (singular).
|
|
1504
1541
|
* Returns undefined when no targets array is specified.
|
|
1505
1542
|
*/
|
|
@@ -1584,6 +1621,7 @@ type LoadOptions = {
|
|
|
1584
1621
|
declare function readTestSuiteMetadata(testFilePath: string): Promise<{
|
|
1585
1622
|
target?: string;
|
|
1586
1623
|
targets?: readonly string[];
|
|
1624
|
+
targetRefs?: readonly EvalTargetRef[];
|
|
1587
1625
|
trials?: TrialsConfig;
|
|
1588
1626
|
}>;
|
|
1589
1627
|
/**
|
|
@@ -1595,6 +1633,8 @@ type EvalSuiteResult = {
|
|
|
1595
1633
|
readonly trials?: TrialsConfig;
|
|
1596
1634
|
/** Suite-level targets from execution.targets (matrix evaluation) */
|
|
1597
1635
|
readonly targets?: readonly string[];
|
|
1636
|
+
/** Suite-level target refs with hooks from execution.targets (object form) */
|
|
1637
|
+
readonly targetRefs?: readonly EvalTargetRef[];
|
|
1598
1638
|
/** Suite-level workers from execution.workers */
|
|
1599
1639
|
readonly workers?: number;
|
|
1600
1640
|
/** Suite-level cache config from execution.cache */
|
|
@@ -1765,7 +1805,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1765
1805
|
command: z.ZodString;
|
|
1766
1806
|
filesFormat: z.ZodOptional<z.ZodString>;
|
|
1767
1807
|
cwd: z.ZodOptional<z.ZodString>;
|
|
1768
|
-
workspaceTemplate: z.ZodOptional<z.ZodString>;
|
|
1769
1808
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1770
1809
|
healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
|
|
1771
1810
|
url: z.ZodString;
|
|
@@ -1782,46 +1821,44 @@ declare const CliTargetConfigSchema: z.ZodObject<{
|
|
|
1782
1821
|
timeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
1783
1822
|
}, "strict", z.ZodTypeAny, {
|
|
1784
1823
|
command: string;
|
|
1785
|
-
cwd?: string | undefined;
|
|
1786
1824
|
timeoutMs?: number | undefined;
|
|
1825
|
+
cwd?: string | undefined;
|
|
1787
1826
|
}, {
|
|
1788
1827
|
command: string;
|
|
1789
|
-
cwd?: string | undefined;
|
|
1790
1828
|
timeoutMs?: number | undefined;
|
|
1829
|
+
cwd?: string | undefined;
|
|
1791
1830
|
}>]>>;
|
|
1792
1831
|
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
1793
1832
|
keepTempFiles: z.ZodOptional<z.ZodBoolean>;
|
|
1794
1833
|
}, "strict", z.ZodTypeAny, {
|
|
1795
1834
|
command: string;
|
|
1796
|
-
|
|
1835
|
+
timeoutMs?: number | undefined;
|
|
1797
1836
|
cwd?: string | undefined;
|
|
1837
|
+
verbose?: boolean | undefined;
|
|
1798
1838
|
healthcheck?: {
|
|
1799
1839
|
url: string;
|
|
1800
1840
|
timeoutMs?: number | undefined;
|
|
1801
1841
|
} | {
|
|
1802
1842
|
command: string;
|
|
1803
|
-
cwd?: string | undefined;
|
|
1804
1843
|
timeoutMs?: number | undefined;
|
|
1844
|
+
cwd?: string | undefined;
|
|
1805
1845
|
} | undefined;
|
|
1806
|
-
timeoutMs?: number | undefined;
|
|
1807
1846
|
filesFormat?: string | undefined;
|
|
1808
|
-
workspaceTemplate?: string | undefined;
|
|
1809
1847
|
keepTempFiles?: boolean | undefined;
|
|
1810
1848
|
}, {
|
|
1811
1849
|
command: string;
|
|
1812
|
-
|
|
1850
|
+
timeoutMs?: number | undefined;
|
|
1813
1851
|
cwd?: string | undefined;
|
|
1852
|
+
verbose?: boolean | undefined;
|
|
1814
1853
|
healthcheck?: {
|
|
1815
1854
|
url: string;
|
|
1816
1855
|
timeoutMs?: number | undefined;
|
|
1817
1856
|
} | {
|
|
1818
1857
|
command: string;
|
|
1819
|
-
cwd?: string | undefined;
|
|
1820
1858
|
timeoutMs?: number | undefined;
|
|
1859
|
+
cwd?: string | undefined;
|
|
1821
1860
|
} | undefined;
|
|
1822
|
-
timeoutMs?: number | undefined;
|
|
1823
1861
|
filesFormat?: string | undefined;
|
|
1824
|
-
workspaceTemplate?: string | undefined;
|
|
1825
1862
|
keepTempFiles?: boolean | undefined;
|
|
1826
1863
|
}>;
|
|
1827
1864
|
type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
|
|
@@ -1907,7 +1944,6 @@ interface CodexResolvedConfig {
|
|
|
1907
1944
|
readonly executable: string;
|
|
1908
1945
|
readonly args?: readonly string[];
|
|
1909
1946
|
readonly cwd?: string;
|
|
1910
|
-
readonly workspaceTemplate?: string;
|
|
1911
1947
|
readonly timeoutMs?: number;
|
|
1912
1948
|
readonly logDir?: string;
|
|
1913
1949
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -1920,7 +1956,6 @@ interface CopilotCliResolvedConfig {
|
|
|
1920
1956
|
readonly model?: string;
|
|
1921
1957
|
readonly args?: readonly string[];
|
|
1922
1958
|
readonly cwd?: string;
|
|
1923
|
-
readonly workspaceTemplate?: string;
|
|
1924
1959
|
readonly timeoutMs?: number;
|
|
1925
1960
|
readonly logDir?: string;
|
|
1926
1961
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -1934,7 +1969,6 @@ interface CopilotSdkResolvedConfig {
|
|
|
1934
1969
|
readonly githubToken?: string;
|
|
1935
1970
|
readonly model?: string;
|
|
1936
1971
|
readonly cwd?: string;
|
|
1937
|
-
readonly workspaceTemplate?: string;
|
|
1938
1972
|
readonly timeoutMs?: number;
|
|
1939
1973
|
readonly logDir?: string;
|
|
1940
1974
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -1974,7 +2008,6 @@ interface PiCodingAgentResolvedConfig {
|
|
|
1974
2008
|
readonly tools?: string;
|
|
1975
2009
|
readonly thinking?: string;
|
|
1976
2010
|
readonly cwd?: string;
|
|
1977
|
-
readonly workspaceTemplate?: string;
|
|
1978
2011
|
readonly timeoutMs?: number;
|
|
1979
2012
|
readonly logDir?: string;
|
|
1980
2013
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -1992,7 +2025,6 @@ interface PiCliResolvedConfig {
|
|
|
1992
2025
|
readonly thinking?: string;
|
|
1993
2026
|
readonly args?: readonly string[];
|
|
1994
2027
|
readonly cwd?: string;
|
|
1995
|
-
readonly workspaceTemplate?: string;
|
|
1996
2028
|
readonly timeoutMs?: number;
|
|
1997
2029
|
readonly logDir?: string;
|
|
1998
2030
|
readonly logFormat?: 'summary' | 'json';
|
|
@@ -2001,10 +2033,10 @@ interface PiCliResolvedConfig {
|
|
|
2001
2033
|
readonly systemPrompt?: string;
|
|
2002
2034
|
}
|
|
2003
2035
|
interface ClaudeResolvedConfig {
|
|
2036
|
+
readonly executable: string;
|
|
2004
2037
|
readonly model?: string;
|
|
2005
2038
|
readonly systemPrompt?: string;
|
|
2006
2039
|
readonly cwd?: string;
|
|
2007
|
-
readonly workspaceTemplate?: string;
|
|
2008
2040
|
readonly timeoutMs?: number;
|
|
2009
2041
|
readonly maxTurns?: number;
|
|
2010
2042
|
readonly maxBudgetUsd?: number;
|
|
@@ -2024,7 +2056,6 @@ interface VSCodeResolvedConfig {
|
|
|
2024
2056
|
readonly waitForResponse: boolean;
|
|
2025
2057
|
readonly dryRun: boolean;
|
|
2026
2058
|
readonly subagentRoot?: string;
|
|
2027
|
-
readonly workspaceTemplate?: string;
|
|
2028
2059
|
readonly timeoutMs?: number;
|
|
2029
2060
|
}
|
|
2030
2061
|
interface AgentVResolvedConfig {
|
|
@@ -2335,9 +2366,9 @@ interface EvaluationContext {
|
|
|
2335
2366
|
readonly targetResolver?: TargetResolver;
|
|
2336
2367
|
/** List of available target names for code graders */
|
|
2337
2368
|
readonly availableTargets?: readonly string[];
|
|
2338
|
-
/** Unified diff of file changes from workspace
|
|
2369
|
+
/** Unified diff of file changes from workspace */
|
|
2339
2370
|
readonly fileChanges?: string;
|
|
2340
|
-
/** Absolute path to the workspace directory
|
|
2371
|
+
/** Absolute path to the workspace directory */
|
|
2341
2372
|
readonly workspacePath?: string;
|
|
2342
2373
|
/** Docker workspace config: when present, code-grader commands run inside a container */
|
|
2343
2374
|
readonly dockerConfig?: DockerWorkspaceConfig;
|
|
@@ -3001,6 +3032,8 @@ interface RunEvalCaseOptions {
|
|
|
3001
3032
|
readonly threshold?: number;
|
|
3002
3033
|
/** Results from dependency tests (only present when the test has depends_on) */
|
|
3003
3034
|
readonly dependencyResults?: Readonly<Record<string, DependencyResult>>;
|
|
3035
|
+
/** Per-target hooks from eval file (before_all, before_each, after_each, after_all) */
|
|
3036
|
+
readonly targetHooks?: TargetHooksConfig;
|
|
3004
3037
|
}
|
|
3005
3038
|
interface ProgressEvent {
|
|
3006
3039
|
readonly workerId: number;
|
|
@@ -3068,6 +3101,8 @@ interface RunEvaluationOptions {
|
|
|
3068
3101
|
readonly model?: string;
|
|
3069
3102
|
/** Per-test score threshold for pass/fail (default: 0.8) */
|
|
3070
3103
|
readonly threshold?: number;
|
|
3104
|
+
/** Per-target hooks from eval file (before_all, before_each, after_each, after_all) */
|
|
3105
|
+
readonly targetHooks?: TargetHooksConfig;
|
|
3071
3106
|
}
|
|
3072
3107
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
3073
3108
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -3951,6 +3986,17 @@ declare function createDraftResultsPr(params: {
|
|
|
3951
3986
|
readonly body: string;
|
|
3952
3987
|
}): Promise<string>;
|
|
3953
3988
|
|
|
3989
|
+
/**
|
|
3990
|
+
* The default config directory (~/.agentv). Always resolves to the user's home
|
|
3991
|
+
* directory regardless of AGENTV_HOME. Used for lightweight, machine-local files
|
|
3992
|
+
* like version-check.json, last-config.json, and projects.yaml.
|
|
3993
|
+
*/
|
|
3994
|
+
declare function getAgentvConfigDir(): string;
|
|
3995
|
+
/**
|
|
3996
|
+
* The data root for heavy/large artifacts (workspaces, workspace-pool, subagents,
|
|
3997
|
+
* trace-state, cache, deps). Respects AGENTV_HOME override so users can relocate
|
|
3998
|
+
* bulky data to a different drive. Falls back to ~/.agentv when unset.
|
|
3999
|
+
*/
|
|
3954
4000
|
declare function getAgentvHome(): string;
|
|
3955
4001
|
declare function getWorkspacesRoot(): string;
|
|
3956
4002
|
declare function getSubagentsRoot(): string;
|
|
@@ -4509,4 +4555,4 @@ type AgentKernel = {
|
|
|
4509
4555
|
};
|
|
4510
4556
|
declare function createAgentKernel(): AgentKernel;
|
|
4511
4557
|
|
|
4512
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
4558
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|