npm - @agentv/core - Versions diffs - 4.15.9-next.1 → 4.16.0-next.1 - Mend

@agentv/core 4.15.9-next.1 → 4.16.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-HVEQNYTC.js → chunk-6VZY3B6M.js} +55 -165
package/dist/chunk-6VZY3B6M.js.map +1 -0
package/dist/evaluation/validation/index.cjs +18 -17
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +13 -12
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +329 -257
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +71 -25
package/dist/index.d.ts +71 -25
package/dist/index.js +249 -59
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-HVEQNYTC.js.map +0 -1

package/dist/index.d.cts CHANGED

@@ -96,7 +96,7 @@ interface ProviderRequest {
     readonly temperature?: number;
     readonly metadata?: JsonObject;
     readonly signal?: AbortSignal;
-    /** Working directory override (e.g., from workspace_template) */
+    /** Working directory override (e.g., from eval-level workspace.template) */
     readonly cwd?: string;
     /** VS Code .code-workspace file (resolved from workspace.template) */
     readonly workspaceFile?: string;
@@ -265,7 +265,6 @@ interface TargetDefinition {
     readonly wait?: boolean | unknown | undefined;
     readonly dry_run?: boolean | unknown | undefined;
     readonly subagent_root?: string | unknown | undefined;
-    readonly workspace_template?: string | unknown | undefined;
     readonly files_format?: string | unknown | undefined;
     readonly attachments_format?: string | unknown | undefined;
     readonly env?: unknown | undefined;
@@ -630,6 +629,38 @@ type WorkspaceHooksConfig = {
     /** Runs once after final test in the workspace lifecycle */
     readonly after_all?: WorkspaceHookConfig;
 };
+/**
+ * Per-target hook configuration defined in eval files.
+ * Target hooks run setup/teardown scripts to customize the workspace for each target variant.
+ *
+ * Execution order relative to workspace hooks:
+ * - Setup: workspace before_all → target before_all → (per test: workspace before_each → target before_each)
+ * - Teardown: (per test: target after_each → workspace after_each) → target after_all → workspace after_all
+ */
+type TargetHooksConfig = {
+    /** Runs once before first test for this target */
+    readonly before_all?: WorkspaceHookConfig;
+    /** Runs before each test case for this target */
+    readonly before_each?: WorkspaceHookConfig;
+    /** Runs after each test case for this target */
+    readonly after_each?: WorkspaceHookConfig;
+    /** Runs once after final test for this target */
+    readonly after_all?: WorkspaceHookConfig;
+};
+/**
+ * Extended target reference from eval file.
+ * Allows eval files to define per-target hooks and delegation alongside target names.
+ *
+ * String targets are shorthand for `{ name: "target-name" }` (no hooks).
+ */
+type EvalTargetRef = {
+    /** Target name (must match a target in targets.yaml or be defined inline with use_target) */
+    readonly name: string;
+    /** Delegate to another named target (same as use_target in targets.yaml) */
+    readonly use_target?: string;
+    /** Per-target hooks for workspace customization */
+    readonly hooks?: TargetHooksConfig;
+};
 /**
  * Docker-based workspace configuration.
  * When present, code-grader commands run inside a Docker container
@@ -1377,7 +1408,7 @@ interface EvaluationResult {
     readonly afterAllOutput?: string;
     /** Captured output from workspace after_each script */
     readonly afterEachOutput?: string;
-    /** Unified diff of workspace file changes (when workspace_template is configured) */
+    /** Unified diff of workspace file changes */
     readonly fileChanges?: string;
     /** Individual trial results (only present when trials.count > 1) */
     readonly trials?: readonly TrialResult[];
@@ -1499,7 +1530,13 @@ declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<Age
  */
 declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
 /**
- * Extract targets array from parsed eval suite.
+ * Extract target refs from parsed eval suite.
+ * Supports both string shorthand and object form with hooks.
+ * Returns undefined when no targets array is specified.
+ */
+declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
+/**
+ * Extract target names from parsed eval suite (backward-compat wrapper).
  * Precedence: execution.targets (array) > execution.target (singular).
  * Returns undefined when no targets array is specified.
  */
@@ -1584,6 +1621,7 @@ type LoadOptions = {
 declare function readTestSuiteMetadata(testFilePath: string): Promise<{
     target?: string;
     targets?: readonly string[];
+    targetRefs?: readonly EvalTargetRef[];
     trials?: TrialsConfig;
 }>;
 /**
@@ -1595,6 +1633,8 @@ type EvalSuiteResult = {
     readonly trials?: TrialsConfig;
     /** Suite-level targets from execution.targets (matrix evaluation) */
     readonly targets?: readonly string[];
+    /** Suite-level target refs with hooks from execution.targets (object form) */
+    readonly targetRefs?: readonly EvalTargetRef[];
     /** Suite-level workers from execution.workers */
     readonly workers?: number;
     /** Suite-level cache config from execution.cache */
@@ -1765,7 +1805,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
     command: z.ZodString;
     filesFormat: z.ZodOptional<z.ZodString>;
     cwd: z.ZodOptional<z.ZodString>;
-    workspaceTemplate: z.ZodOptional<z.ZodString>;
     timeoutMs: z.ZodOptional<z.ZodNumber>;
     healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
         url: z.ZodString;
@@ -1782,46 +1821,44 @@ declare const CliTargetConfigSchema: z.ZodObject<{
         timeoutMs: z.ZodOptional<z.ZodNumber>;
     }, "strict", z.ZodTypeAny, {
         command: string;
-        cwd?: string | undefined;
         timeoutMs?: number | undefined;
+        cwd?: string | undefined;
     }, {
         command: string;
-        cwd?: string | undefined;
         timeoutMs?: number | undefined;
+        cwd?: string | undefined;
     }>]>>;
     verbose: z.ZodOptional<z.ZodBoolean>;
     keepTempFiles: z.ZodOptional<z.ZodBoolean>;
 }, "strict", z.ZodTypeAny, {
     command: string;
-    verbose?: boolean | undefined;
+    timeoutMs?: number | undefined;
     cwd?: string | undefined;
+    verbose?: boolean | undefined;
     healthcheck?: {
         url: string;
         timeoutMs?: number | undefined;
     } | {
         command: string;
-        cwd?: string | undefined;
         timeoutMs?: number | undefined;
+        cwd?: string | undefined;
     } | undefined;
-    timeoutMs?: number | undefined;
     filesFormat?: string | undefined;
-    workspaceTemplate?: string | undefined;
     keepTempFiles?: boolean | undefined;
 }, {
     command: string;
-    verbose?: boolean | undefined;
+    timeoutMs?: number | undefined;
     cwd?: string | undefined;
+    verbose?: boolean | undefined;
     healthcheck?: {
         url: string;
         timeoutMs?: number | undefined;
     } | {
         command: string;
-        cwd?: string | undefined;
         timeoutMs?: number | undefined;
+        cwd?: string | undefined;
     } | undefined;
-    timeoutMs?: number | undefined;
     filesFormat?: string | undefined;
-    workspaceTemplate?: string | undefined;
     keepTempFiles?: boolean | undefined;
 }>;
 type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
@@ -1907,7 +1944,6 @@ interface CodexResolvedConfig {
     readonly executable: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -1920,7 +1956,6 @@ interface CopilotCliResolvedConfig {
     readonly model?: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -1934,7 +1969,6 @@ interface CopilotSdkResolvedConfig {
     readonly githubToken?: string;
     readonly model?: string;
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -1974,7 +2008,6 @@ interface PiCodingAgentResolvedConfig {
     readonly tools?: string;
     readonly thinking?: string;
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -1992,7 +2025,6 @@ interface PiCliResolvedConfig {
     readonly thinking?: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -2001,10 +2033,10 @@ interface PiCliResolvedConfig {
     readonly systemPrompt?: string;
 }
 interface ClaudeResolvedConfig {
+    readonly executable: string;
     readonly model?: string;
     readonly systemPrompt?: string;
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly maxTurns?: number;
     readonly maxBudgetUsd?: number;
@@ -2024,7 +2056,6 @@ interface VSCodeResolvedConfig {
     readonly waitForResponse: boolean;
     readonly dryRun: boolean;
     readonly subagentRoot?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
 }
 interface AgentVResolvedConfig {
@@ -2335,9 +2366,9 @@ interface EvaluationContext {
     readonly targetResolver?: TargetResolver;
     /** List of available target names for code graders */
     readonly availableTargets?: readonly string[];
-    /** Unified diff of file changes from workspace (when workspace_template is configured) */
+    /** Unified diff of file changes from workspace */
     readonly fileChanges?: string;
-    /** Absolute path to the workspace directory (when workspace_template is configured) */
+    /** Absolute path to the workspace directory */
     readonly workspacePath?: string;
     /** Docker workspace config: when present, code-grader commands run inside a container */
     readonly dockerConfig?: DockerWorkspaceConfig;
@@ -3001,6 +3032,8 @@ interface RunEvalCaseOptions {
     readonly threshold?: number;
     /** Results from dependency tests (only present when the test has depends_on) */
     readonly dependencyResults?: Readonly<Record<string, DependencyResult>>;
+    /** Per-target hooks from eval file (before_all, before_each, after_each, after_all) */
+    readonly targetHooks?: TargetHooksConfig;
 }
 interface ProgressEvent {
     readonly workerId: number;
@@ -3068,6 +3101,8 @@ interface RunEvaluationOptions {
     readonly model?: string;
     /** Per-test score threshold for pass/fail (default: 0.8) */
     readonly threshold?: number;
+    /** Per-target hooks from eval file (before_all, before_each, after_each, after_all) */
+    readonly targetHooks?: TargetHooksConfig;
 }
 declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
 declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -3951,6 +3986,17 @@ declare function createDraftResultsPr(params: {
     readonly body: string;
 }): Promise<string>;
+/**
+ * The default config directory (~/.agentv). Always resolves to the user's home
+ * directory regardless of AGENTV_HOME. Used for lightweight, machine-local files
+ * like version-check.json, last-config.json, and projects.yaml.
+ */
+declare function getAgentvConfigDir(): string;
+/**
+ * The data root for heavy/large artifacts (workspaces, workspace-pool, subagents,
+ * trace-state, cache, deps). Respects AGENTV_HOME override so users can relocate
+ * bulky data to a different drive. Falls back to ~/.agentv when unset.
+ */
 declare function getAgentvHome(): string;
 declare function getWorkspacesRoot(): string;
 declare function getSubagentsRoot(): string;
@@ -4509,4 +4555,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type 
PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, 
captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, 
readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type 
PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, 
calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, 
prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };

package/dist/index.d.ts CHANGED

@@ -96,7 +96,7 @@ interface ProviderRequest {
     readonly temperature?: number;
     readonly metadata?: JsonObject;
     readonly signal?: AbortSignal;
-    /** Working directory override (e.g., from workspace_template) */
+    /** Working directory override (e.g., from eval-level workspace.template) */
     readonly cwd?: string;
     /** VS Code .code-workspace file (resolved from workspace.template) */
     readonly workspaceFile?: string;
@@ -265,7 +265,6 @@ interface TargetDefinition {
     readonly wait?: boolean | unknown | undefined;
     readonly dry_run?: boolean | unknown | undefined;
     readonly subagent_root?: string | unknown | undefined;
-    readonly workspace_template?: string | unknown | undefined;
     readonly files_format?: string | unknown | undefined;
     readonly attachments_format?: string | unknown | undefined;
     readonly env?: unknown | undefined;
@@ -630,6 +629,38 @@ type WorkspaceHooksConfig = {
     /** Runs once after final test in the workspace lifecycle */
     readonly after_all?: WorkspaceHookConfig;
 };
+/**
+ * Per-target hook configuration defined in eval files.
+ * Target hooks run setup/teardown scripts to customize the workspace for each target variant.
+ *
+ * Execution order relative to workspace hooks:
+ * - Setup: workspace before_all → target before_all → (per test: workspace before_each → target before_each)
+ * - Teardown: (per test: target after_each → workspace after_each) → target after_all → workspace after_all
+ */
+type TargetHooksConfig = {
+    /** Runs once before first test for this target */
+    readonly before_all?: WorkspaceHookConfig;
+    /** Runs before each test case for this target */
+    readonly before_each?: WorkspaceHookConfig;
+    /** Runs after each test case for this target */
+    readonly after_each?: WorkspaceHookConfig;
+    /** Runs once after final test for this target */
+    readonly after_all?: WorkspaceHookConfig;
+};
+/**
+ * Extended target reference from eval file.
+ * Allows eval files to define per-target hooks and delegation alongside target names.
+ *
+ * String targets are shorthand for `{ name: "target-name" }` (no hooks).
+ */
+type EvalTargetRef = {
+    /** Target name (must match a target in targets.yaml or be defined inline with use_target) */
+    readonly name: string;
+    /** Delegate to another named target (same as use_target in targets.yaml) */
+    readonly use_target?: string;
+    /** Per-target hooks for workspace customization */
+    readonly hooks?: TargetHooksConfig;
+};
 /**
  * Docker-based workspace configuration.
  * When present, code-grader commands run inside a Docker container
@@ -1377,7 +1408,7 @@ interface EvaluationResult {
     readonly afterAllOutput?: string;
     /** Captured output from workspace after_each script */
     readonly afterEachOutput?: string;
-    /** Unified diff of workspace file changes (when workspace_template is configured) */
+    /** Unified diff of workspace file changes */
     readonly fileChanges?: string;
     /** Individual trial results (only present when trials.count > 1) */
     readonly trials?: readonly TrialResult[];
@@ -1499,7 +1530,13 @@ declare function loadConfig(evalFilePath: string, repoRoot: string): Promise<Age
  */
 declare function extractTargetFromSuite(suite: JsonObject): string | undefined;
 /**
- * Extract targets array from parsed eval suite.
+ * Extract target refs from parsed eval suite.
+ * Supports both string shorthand and object form with hooks.
+ * Returns undefined when no targets array is specified.
+ */
+declare function extractTargetRefsFromSuite(suite: JsonObject): readonly EvalTargetRef[] | undefined;
+/**
+ * Extract target names from parsed eval suite (backward-compat wrapper).
  * Precedence: execution.targets (array) > execution.target (singular).
  * Returns undefined when no targets array is specified.
  */
@@ -1584,6 +1621,7 @@ type LoadOptions = {
 declare function readTestSuiteMetadata(testFilePath: string): Promise<{
     target?: string;
     targets?: readonly string[];
+    targetRefs?: readonly EvalTargetRef[];
     trials?: TrialsConfig;
 }>;
 /**
@@ -1595,6 +1633,8 @@ type EvalSuiteResult = {
     readonly trials?: TrialsConfig;
     /** Suite-level targets from execution.targets (matrix evaluation) */
     readonly targets?: readonly string[];
+    /** Suite-level target refs with hooks from execution.targets (object form) */
+    readonly targetRefs?: readonly EvalTargetRef[];
     /** Suite-level workers from execution.workers */
     readonly workers?: number;
     /** Suite-level cache config from execution.cache */
@@ -1765,7 +1805,6 @@ declare const CliTargetConfigSchema: z.ZodObject<{
     command: z.ZodString;
     filesFormat: z.ZodOptional<z.ZodString>;
     cwd: z.ZodOptional<z.ZodString>;
-    workspaceTemplate: z.ZodOptional<z.ZodString>;
     timeoutMs: z.ZodOptional<z.ZodNumber>;
     healthcheck: z.ZodOptional<z.ZodUnion<[z.ZodObject<{
         url: z.ZodString;
@@ -1782,46 +1821,44 @@ declare const CliTargetConfigSchema: z.ZodObject<{
         timeoutMs: z.ZodOptional<z.ZodNumber>;
     }, "strict", z.ZodTypeAny, {
         command: string;
-        cwd?: string | undefined;
         timeoutMs?: number | undefined;
+        cwd?: string | undefined;
     }, {
         command: string;
-        cwd?: string | undefined;
         timeoutMs?: number | undefined;
+        cwd?: string | undefined;
     }>]>>;
     verbose: z.ZodOptional<z.ZodBoolean>;
     keepTempFiles: z.ZodOptional<z.ZodBoolean>;
 }, "strict", z.ZodTypeAny, {
     command: string;
-    verbose?: boolean | undefined;
+    timeoutMs?: number | undefined;
     cwd?: string | undefined;
+    verbose?: boolean | undefined;
     healthcheck?: {
         url: string;
         timeoutMs?: number | undefined;
     } | {
         command: string;
-        cwd?: string | undefined;
         timeoutMs?: number | undefined;
+        cwd?: string | undefined;
     } | undefined;
-    timeoutMs?: number | undefined;
     filesFormat?: string | undefined;
-    workspaceTemplate?: string | undefined;
     keepTempFiles?: boolean | undefined;
 }, {
     command: string;
-    verbose?: boolean | undefined;
+    timeoutMs?: number | undefined;
     cwd?: string | undefined;
+    verbose?: boolean | undefined;
     healthcheck?: {
         url: string;
         timeoutMs?: number | undefined;
     } | {
         command: string;
-        cwd?: string | undefined;
         timeoutMs?: number | undefined;
+        cwd?: string | undefined;
     } | undefined;
-    timeoutMs?: number | undefined;
     filesFormat?: string | undefined;
-    workspaceTemplate?: string | undefined;
     keepTempFiles?: boolean | undefined;
 }>;
 type CliNormalizedConfig = z.infer<typeof CliTargetConfigSchema>;
@@ -1907,7 +1944,6 @@ interface CodexResolvedConfig {
     readonly executable: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -1920,7 +1956,6 @@ interface CopilotCliResolvedConfig {
     readonly model?: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -1934,7 +1969,6 @@ interface CopilotSdkResolvedConfig {
     readonly githubToken?: string;
     readonly model?: string;
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -1974,7 +2008,6 @@ interface PiCodingAgentResolvedConfig {
     readonly tools?: string;
     readonly thinking?: string;
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -1992,7 +2025,6 @@ interface PiCliResolvedConfig {
     readonly thinking?: string;
     readonly args?: readonly string[];
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly logDir?: string;
     readonly logFormat?: 'summary' | 'json';
@@ -2001,10 +2033,10 @@ interface PiCliResolvedConfig {
     readonly systemPrompt?: string;
 }
 interface ClaudeResolvedConfig {
+    readonly executable: string;
     readonly model?: string;
     readonly systemPrompt?: string;
     readonly cwd?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
     readonly maxTurns?: number;
     readonly maxBudgetUsd?: number;
@@ -2024,7 +2056,6 @@ interface VSCodeResolvedConfig {
     readonly waitForResponse: boolean;
     readonly dryRun: boolean;
     readonly subagentRoot?: string;
-    readonly workspaceTemplate?: string;
     readonly timeoutMs?: number;
 }
 interface AgentVResolvedConfig {
@@ -2335,9 +2366,9 @@ interface EvaluationContext {
     readonly targetResolver?: TargetResolver;
     /** List of available target names for code graders */
     readonly availableTargets?: readonly string[];
-    /** Unified diff of file changes from workspace (when workspace_template is configured) */
+    /** Unified diff of file changes from workspace */
     readonly fileChanges?: string;
-    /** Absolute path to the workspace directory (when workspace_template is configured) */
+    /** Absolute path to the workspace directory */
     readonly workspacePath?: string;
     /** Docker workspace config: when present, code-grader commands run inside a container */
     readonly dockerConfig?: DockerWorkspaceConfig;
@@ -3001,6 +3032,8 @@ interface RunEvalCaseOptions {
     readonly threshold?: number;
     /** Results from dependency tests (only present when the test has depends_on) */
     readonly dependencyResults?: Readonly<Record<string, DependencyResult>>;
+    /** Per-target hooks from eval file (before_all, before_each, after_each, after_all) */
+    readonly targetHooks?: TargetHooksConfig;
 }
 interface ProgressEvent {
     readonly workerId: number;
@@ -3068,6 +3101,8 @@ interface RunEvaluationOptions {
     readonly model?: string;
     /** Per-test score threshold for pass/fail (default: 0.8) */
     readonly threshold?: number;
+    /** Per-target hooks from eval file (before_all, before_each, after_each, after_all) */
+    readonly targetHooks?: TargetHooksConfig;
 }
 declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
 declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -3951,6 +3986,17 @@ declare function createDraftResultsPr(params: {
     readonly body: string;
 }): Promise<string>;
+/**
+ * The default config directory (~/.agentv). Always resolves to the user's home
+ * directory regardless of AGENTV_HOME. Used for lightweight, machine-local files
+ * like version-check.json, last-config.json, and projects.yaml.
+ */
+declare function getAgentvConfigDir(): string;
+/**
+ * The data root for heavy/large artifacts (workspaces, workspace-pool, subagents,
+ * trace-state, cache, deps). Respects AGENTV_HOME override so users can relocate
+ * bulky data to a different drive. Falls back to ~/.agentv when unset.
+ */
 declare function getAgentvHome(): string;
 declare function getWorkspacesRoot(): string;
 declare function getSubagentsRoot(): string;
@@ -4509,4 +4555,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type 
PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, 
captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, 
readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type BenchmarkEntry, type BenchmarkRegistry, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type ConversationAggregation, type ConversationMode, type ConversationTurn, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DependencyFailurePolicy, type DependencyResult, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type 
EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTargetRef, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type 
PreparedResultsRepoBranch, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TargetHooksConfig, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptReplayEntry, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type TurnFailurePolicy, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addBenchmark, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, 
calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveBenchmarkId, deriveCategory, detectFormat, directorySizeBytes, discoverAssertions, discoverBenchmarks, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetRefsFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvConfigDir, getAgentvHome, getBenchmark, getBenchmarksRegistryPath, getOutputFilenames, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, groupTranscriptJsonLines, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadBenchmarkRegistry, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, 
prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeBenchmark, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveBenchmarkRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLines, tokensPerTool, touchBenchmark, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };