npm - @agentv/core - Versions diffs - 4.10.0 → 4.11.2-next.1 - Mend

@agentv/core 4.10.0 → 4.11.2-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/dist/chunk-3WGHC7LC.js +149 -0
package/dist/chunk-3WGHC7LC.js.map +1 -0
package/dist/{chunk-BWHUWLGW.js → chunk-5POFMJJ7.js} +1 -1
package/dist/chunk-5POFMJJ7.js.map +1 -0
package/dist/chunk-SDIANPEY.js +181 -0
package/dist/chunk-SDIANPEY.js.map +1 -0
package/dist/docker-workspace-RPPXBT27.js +9 -0
package/dist/docker-workspace-RPPXBT27.js.map +1 -0
package/dist/evaluation/validation/index.cjs +70 -3
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +71 -4
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/exec-AR6JUUN5.js +9 -0
package/dist/exec-AR6JUUN5.js.map +1 -0
package/dist/index.cjs +1264 -468
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +191 -5
package/dist/index.d.ts +191 -5
package/dist/index.js +780 -342
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-BWHUWLGW.js.map +0 -1

package/dist/index.d.cts CHANGED Viewed

@@ -571,6 +571,8 @@ type RepoSource = {
 };
 type RepoCheckout = {
     readonly ref?: string;
+    /** SWE-bench-friendly alias for ref when pinning a dataset snapshot commit */
+    readonly base_commit?: string;
     readonly resolve?: 'remote' | 'local';
     readonly ancestor?: number;
 };
@@ -580,8 +582,10 @@ type RepoClone = {
     readonly sparse?: readonly string[];
 };
 type RepoConfig = {
-    readonly path: string;
-    readonly source: RepoSource;
+    /** Target path inside the workspace. Optional for Docker repos targeting the container's working directory. */
+    readonly path?: string;
+    /** Clone source. Optional for Docker prebuilt images where repos exist inside the container. */
+    readonly source?: RepoSource;
     readonly checkout?: RepoCheckout;
     readonly clone?: RepoClone;
 };
@@ -610,6 +614,21 @@ type WorkspaceHooksConfig = {
     /** Runs once after final test in the workspace lifecycle */
     readonly after_all?: WorkspaceHookConfig;
 };
+/**
+ * Docker-based workspace configuration.
+ * When present, code-grader commands run inside a Docker container
+ * instead of on the host.
+ */
+type DockerWorkspaceConfig = {
+    /** Docker image to use (e.g. 'swebench/sweb.eval.x86_64.django__django-15180') */
+    readonly image: string;
+    /** Container execution timeout in seconds (default: 1800) */
+    readonly timeout?: number;
+    /** Memory limit (e.g. '4g', '512m') */
+    readonly memory?: string;
+    /** CPU limit (e.g. 2, 0.5) */
+    readonly cpus?: number;
+};
 type WorkspaceConfig = {
     /** Template directory or .code-workspace file. Directories are copied to temp workspace.
      *  .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
@@ -624,6 +643,8 @@ type WorkspaceConfig = {
     readonly mode?: 'pooled' | 'temp' | 'static';
     /** Required when mode=static: use this existing directory directly */
     readonly path?: string;
+    /** Docker-based workspace: run grader commands inside a container */
+    readonly docker?: DockerWorkspaceConfig;
 };
 type CodeEvaluatorConfig = {
     readonly name: string;
@@ -1372,10 +1393,19 @@ type ExecutionDefaults = {
     readonly pool_workspaces?: boolean;
     readonly pool_slots?: number;
 };
+type ResultsExportConfig = {
+    readonly repo: string;
+    readonly path: string;
+    readonly auto_push?: boolean;
+    readonly branch_prefix?: string;
+};
 type AgentVConfig$1 = {
     readonly required_version?: string;
     readonly eval_patterns?: readonly string[];
     readonly execution?: ExecutionDefaults;
+    readonly results?: {
+        readonly export?: ResultsExportConfig;
+    };
 };
 /**
  * Load optional .agentv/config.yaml configuration file.
@@ -2213,6 +2243,8 @@ interface EvaluationContext {
     readonly fileChanges?: string;
     /** Absolute path to the workspace directory (when workspace_template is configured) */
     readonly workspacePath?: string;
+    /** Docker workspace config: when present, code-grader commands run inside a container */
+    readonly dockerConfig?: DockerWorkspaceConfig;
 }
 interface EvaluationScore {
     readonly score: number;
@@ -2813,9 +2845,9 @@ declare class RepoManager {
      * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
      */
     materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
-    /** Materialize all repos into the workspace. */
+    /** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
     materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
-    /** Reset repos in workspace to their checkout state. */
+    /** Reset repos in workspace to their checkout state. Skips repos without path or source. */
     reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
 }
@@ -3606,6 +3638,106 @@ interface DepsScanResult {
  */
 declare function scanRepoDeps(evalFilePaths: readonly string[]): Promise<DepsScanResult>;
+interface RepoCheckoutTarget {
+    readonly path?: string;
+    readonly ref: string;
+}
+/**
+ * Docker workspace provider — manages Docker container lifecycle for eval grading.
+ *
+ * Flow: pull image → create container → copy files in → exec grader → parse output → destroy container.
+ * All Docker commands use `execFile` (no shell) for security.
+ *
+ * To add a new Docker command: add a method that calls `this.exec(...)` with the appropriate argv.
+ *
+ * Design decisions:
+ * - CommandExecutor interface for testability (mock `execFile` in tests)
+ * - Always `docker rm -f` in cleanup, even on errors (try/finally)
+ * - Lazy-loaded: non-Docker evals never import this module
+ */
+/** Result of a command execution */
+interface ExecResult {
+    readonly stdout: string;
+    readonly stderr: string;
+    readonly exitCode: number;
+}
+/** Abstraction over process execution for testability */
+interface CommandExecutor {
+    exec(argv: readonly string[], options?: {
+        timeoutMs?: number;
+        stdin?: string;
+    }): Promise<ExecResult>;
+}
+/** Options for creating a Docker container */
+interface CreateContainerOptions {
+    readonly image: string;
+    readonly memory?: string;
+    readonly cpus?: number;
+}
+/** Options for executing a command inside a container */
+interface ExecInContainerOptions {
+    readonly containerId: string;
+    readonly command: readonly string[];
+    readonly timeoutMs?: number;
+    readonly stdin?: string;
+}
+/**
+ * Manages Docker container lifecycle for workspace-based evaluations.
+ *
+ * Usage:
+ *   const docker = new DockerWorkspaceProvider(config);
+ *   await docker.pullImage();
+ *   const containerId = await docker.createContainer();
+ *   try {
+ *     await docker.copyToContainer(containerId, localPath, containerPath);
+ *     const output = await docker.execInContainer({ containerId, command: [...] });
+ *     // parse output...
+ *   } finally {
+ *     await docker.removeContainer(containerId);
+ *   }
+ */
+declare class DockerWorkspaceProvider {
+    private readonly config;
+    private readonly executor;
+    private readonly timeoutMs;
+    constructor(config: DockerWorkspaceConfig, executor?: CommandExecutor);
+    /** Check whether the Docker CLI is available on the host. */
+    isDockerAvailable(): Promise<boolean>;
+    /** Pull the configured Docker image. No-op if already cached locally. */
+    pullImage(): Promise<void>;
+    /** Create a stopped container from the configured image with resource limits. Returns container ID. */
+    createContainer(): Promise<string>;
+    /** Start a previously created container. */
+    startContainer(containerId: string): Promise<void>;
+    /**
+     * Reset the container checkout to the specified target refs, if any.
+     * This is used for SWE-bench images where the repo state must match the
+     * dataset's base snapshot before grading begins.
+     */
+    resetContainerCheckout(containerId: string, repoCheckouts?: readonly RepoCheckoutTarget[]): Promise<void>;
+    /** Copy a local file or directory into a running container. */
+    copyToContainer(containerId: string, localPath: string, containerPath: string): Promise<void>;
+    /**
+     * Execute a command inside a running container.
+     * If stdin is provided, it is piped via `docker exec -i`.
+     */
+    execInContainer(options: ExecInContainerOptions): Promise<ExecResult>;
+    /** Force-remove a container (always succeeds, even if container doesn't exist). */
+    removeContainer(containerId: string): Promise<void>;
+    /** Full lifecycle: create → start → exec → cleanup. Convenience for single-command grading. */
+    runGraderInContainer(options: {
+        readonly command: readonly string[];
+        readonly stdin?: string;
+        readonly copyFiles?: ReadonlyArray<{
+            localPath: string;
+            containerPath: string;
+        }>;
+        readonly repoCheckouts?: readonly RepoCheckoutTarget[];
+    }): Promise<ExecResult>;
+}
 /**
  * File-based LLM response cache.
  * Stores provider responses as JSON files keyed by SHA-256 hash.
@@ -3662,6 +3794,60 @@ declare function toSnakeCaseDeep(obj: unknown): unknown;
  */
 declare function toCamelCaseDeep(obj: unknown): unknown;
+interface ResultsRepoCachePaths {
+    readonly rootDir: string;
+    readonly repoDir: string;
+    readonly statusFile: string;
+}
+interface ResultsRepoStatus {
+    readonly configured: boolean;
+    readonly available: boolean;
+    readonly repo?: string;
+    readonly path?: string;
+    readonly auto_push?: boolean;
+    readonly branch_prefix?: string;
+    readonly cache_dir?: string;
+    readonly last_synced_at?: string;
+    readonly last_error?: string;
+}
+interface CheckedOutResultsRepoBranch {
+    readonly branchName: string;
+    readonly baseBranch: string;
+    readonly repoDir: string;
+}
+interface PreparedResultsRepoBranch extends CheckedOutResultsRepoBranch {
+    readonly cleanup: () => Promise<void>;
+}
+declare function normalizeResultsExportConfig(config: ResultsExportConfig): Required<ResultsExportConfig>;
+declare function resolveResultsRepoUrl(repo: string): string;
+declare function getResultsRepoCachePaths(repo: string): ResultsRepoCachePaths;
+declare function ensureResultsRepoClone(config: ResultsExportConfig): Promise<string>;
+declare function getResultsRepoStatus(config?: ResultsExportConfig): ResultsRepoStatus;
+declare function syncResultsRepo(config: ResultsExportConfig): Promise<ResultsRepoStatus>;
+declare function checkoutResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<CheckedOutResultsRepoBranch>;
+declare function prepareResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<PreparedResultsRepoBranch>;
+declare function stageResultsArtifacts(params: {
+    readonly repoDir: string;
+    readonly sourceDir: string;
+    readonly destinationDir: string;
+}): Promise<void>;
+declare function resolveResultsRepoRunsDir(config: ResultsExportConfig): string;
+declare function directorySizeBytes(targetPath: string): Promise<number>;
+declare function commitAndPushResultsBranch(params: {
+    readonly repoDir: string;
+    readonly branchName: string;
+    readonly commitMessage: string;
+}): Promise<boolean>;
+declare function pushResultsRepoBranch(config: ResultsExportConfig, branchName: string, cwd?: string): Promise<void>;
+declare function createDraftResultsPr(params: {
+    readonly repo: string;
+    readonly repoDir: string;
+    readonly baseBranch: string;
+    readonly branchName: string;
+    readonly title: string;
+    readonly body: string;
+}): Promise<string>;
 declare function getAgentvHome(): string;
 declare function getWorkspacesRoot(): string;
 declare function getSubagentsRoot(): string;
@@ -4186,4 +4372,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, 
type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, 
type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, 
discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, 
toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type 
EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, 
ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, 
consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };

package/dist/index.d.ts CHANGED Viewed

@@ -571,6 +571,8 @@ type RepoSource = {
 };
 type RepoCheckout = {
     readonly ref?: string;
+    /** SWE-bench-friendly alias for ref when pinning a dataset snapshot commit */
+    readonly base_commit?: string;
     readonly resolve?: 'remote' | 'local';
     readonly ancestor?: number;
 };
@@ -580,8 +582,10 @@ type RepoClone = {
     readonly sparse?: readonly string[];
 };
 type RepoConfig = {
-    readonly path: string;
-    readonly source: RepoSource;
+    /** Target path inside the workspace. Optional for Docker repos targeting the container's working directory. */
+    readonly path?: string;
+    /** Clone source. Optional for Docker prebuilt images where repos exist inside the container. */
+    readonly source?: RepoSource;
     readonly checkout?: RepoCheckout;
     readonly clone?: RepoClone;
 };
@@ -610,6 +614,21 @@ type WorkspaceHooksConfig = {
     /** Runs once after final test in the workspace lifecycle */
     readonly after_all?: WorkspaceHookConfig;
 };
+/**
+ * Docker-based workspace configuration.
+ * When present, code-grader commands run inside a Docker container
+ * instead of on the host.
+ */
+type DockerWorkspaceConfig = {
+    /** Docker image to use (e.g. 'swebench/sweb.eval.x86_64.django__django-15180') */
+    readonly image: string;
+    /** Container execution timeout in seconds (default: 1800) */
+    readonly timeout?: number;
+    /** Memory limit (e.g. '4g', '512m') */
+    readonly memory?: string;
+    /** CPU limit (e.g. 2, 0.5) */
+    readonly cpus?: number;
+};
 type WorkspaceConfig = {
     /** Template directory or .code-workspace file. Directories are copied to temp workspace.
      *  .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
@@ -624,6 +643,8 @@ type WorkspaceConfig = {
     readonly mode?: 'pooled' | 'temp' | 'static';
     /** Required when mode=static: use this existing directory directly */
     readonly path?: string;
+    /** Docker-based workspace: run grader commands inside a container */
+    readonly docker?: DockerWorkspaceConfig;
 };
 type CodeEvaluatorConfig = {
     readonly name: string;
@@ -1372,10 +1393,19 @@ type ExecutionDefaults = {
     readonly pool_workspaces?: boolean;
     readonly pool_slots?: number;
 };
+type ResultsExportConfig = {
+    readonly repo: string;
+    readonly path: string;
+    readonly auto_push?: boolean;
+    readonly branch_prefix?: string;
+};
 type AgentVConfig$1 = {
     readonly required_version?: string;
     readonly eval_patterns?: readonly string[];
     readonly execution?: ExecutionDefaults;
+    readonly results?: {
+        readonly export?: ResultsExportConfig;
+    };
 };
 /**
  * Load optional .agentv/config.yaml configuration file.
@@ -2213,6 +2243,8 @@ interface EvaluationContext {
     readonly fileChanges?: string;
     /** Absolute path to the workspace directory (when workspace_template is configured) */
     readonly workspacePath?: string;
+    /** Docker workspace config: when present, code-grader commands run inside a container */
+    readonly dockerConfig?: DockerWorkspaceConfig;
 }
 interface EvaluationScore {
     readonly score: number;
@@ -2813,9 +2845,9 @@ declare class RepoManager {
      * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
      */
     materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
-    /** Materialize all repos into the workspace. */
+    /** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
     materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
-    /** Reset repos in workspace to their checkout state. */
+    /** Reset repos in workspace to their checkout state. Skips repos without path or source. */
     reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
 }
@@ -3606,6 +3638,106 @@ interface DepsScanResult {
  */
 declare function scanRepoDeps(evalFilePaths: readonly string[]): Promise<DepsScanResult>;
+interface RepoCheckoutTarget {
+    readonly path?: string;
+    readonly ref: string;
+}
+/**
+ * Docker workspace provider — manages Docker container lifecycle for eval grading.
+ *
+ * Flow: pull image → create container → start container → copy files in → exec grader → parse output → destroy container.
+ * All Docker commands use `execFile` (no shell) for security.
+ *
+ * To add a new Docker command: add a method that calls `this.exec(...)` with the appropriate argv.
+ *
+ * Design decisions:
+ * - CommandExecutor interface for testability (mock `execFile` in tests)
+ * - Always `docker rm -f` in cleanup, even on errors (try/finally)
+ * - Lazy-loaded: non-Docker evals never import this module
+ */
+/** Result of a command execution */
+interface ExecResult {
+    readonly stdout: string;
+    readonly stderr: string;
+    readonly exitCode: number;
+}
+/** Abstraction over process execution for testability */
+interface CommandExecutor {
+    exec(argv: readonly string[], options?: {
+        timeoutMs?: number;
+        stdin?: string;
+    }): Promise<ExecResult>;
+}
+/** Options for creating a Docker container */
+interface CreateContainerOptions {
+    readonly image: string;
+    readonly memory?: string;
+    readonly cpus?: number;
+}
+/** Options for executing a command inside a container */
+interface ExecInContainerOptions {
+    readonly containerId: string;
+    readonly command: readonly string[];
+    readonly timeoutMs?: number;
+    readonly stdin?: string;
+}
+/**
+ * Manages Docker container lifecycle for workspace-based evaluations.
+ *
+ * Usage:
+ *   const docker = new DockerWorkspaceProvider(config);
+ *   await docker.pullImage();
+ *   const containerId = await docker.createContainer();
+ *   try {
+ *     await docker.startContainer(containerId); // createContainer() returns a stopped container
+ *     await docker.copyToContainer(containerId, localPath, containerPath);
+ *     const output = await docker.execInContainer({ containerId, command: [...] }); // then parse output
+ *   } finally {
+ *     await docker.removeContainer(containerId);
+ *   }
+ */
+declare class DockerWorkspaceProvider {
+    private readonly config;
+    private readonly executor;
+    private readonly timeoutMs;
+    constructor(config: DockerWorkspaceConfig, executor?: CommandExecutor);
+    /** Check whether the Docker CLI is available on the host. */
+    isDockerAvailable(): Promise<boolean>;
+    /** Pull the configured Docker image. No-op if already cached locally. */
+    pullImage(): Promise<void>;
+    /** Create a stopped container from the configured image with resource limits. Returns container ID. */
+    createContainer(): Promise<string>;
+    /** Start a previously created container. */
+    startContainer(containerId: string): Promise<void>;
+    /**
+     * Reset the container checkout to the specified target refs, if any.
+     * This is used for SWE-bench images where the repo state must match the
+     * dataset's base snapshot before grading begins.
+     */
+    resetContainerCheckout(containerId: string, repoCheckouts?: readonly RepoCheckoutTarget[]): Promise<void>;
+    /** Copy a local file or directory into a running container. */
+    copyToContainer(containerId: string, localPath: string, containerPath: string): Promise<void>;
+    /**
+     * Execute a command inside a running container.
+     * If stdin is provided, it is piped via `docker exec -i`.
+     */
+    execInContainer(options: ExecInContainerOptions): Promise<ExecResult>;
+    /** Force-remove a container (always succeeds, even if container doesn't exist). */
+    removeContainer(containerId: string): Promise<void>;
+    /** Full lifecycle: create → start → exec → cleanup. Convenience for single-command grading. */
+    runGraderInContainer(options: {
+        readonly command: readonly string[];
+        readonly stdin?: string;
+        readonly copyFiles?: ReadonlyArray<{
+            localPath: string;
+            containerPath: string;
+        }>;
+        readonly repoCheckouts?: readonly RepoCheckoutTarget[];
+    }): Promise<ExecResult>;
+}
 /**
  * File-based LLM response cache.
  * Stores provider responses as JSON files keyed by SHA-256 hash.
@@ -3662,6 +3794,60 @@ declare function toSnakeCaseDeep(obj: unknown): unknown;
  */
 declare function toCamelCaseDeep(obj: unknown): unknown;
+interface ResultsRepoCachePaths {
+    readonly rootDir: string;
+    readonly repoDir: string;
+    readonly statusFile: string;
+}
+interface ResultsRepoStatus {
+    readonly configured: boolean;
+    readonly available: boolean;
+    readonly repo?: string;
+    readonly path?: string;
+    readonly auto_push?: boolean;
+    readonly branch_prefix?: string;
+    readonly cache_dir?: string;
+    readonly last_synced_at?: string;
+    readonly last_error?: string;
+}
+interface CheckedOutResultsRepoBranch {
+    readonly branchName: string;
+    readonly baseBranch: string;
+    readonly repoDir: string;
+}
+interface PreparedResultsRepoBranch extends CheckedOutResultsRepoBranch {
+    readonly cleanup: () => Promise<void>;
+}
+declare function normalizeResultsExportConfig(config: ResultsExportConfig): Required<ResultsExportConfig>;
+declare function resolveResultsRepoUrl(repo: string): string;
+declare function getResultsRepoCachePaths(repo: string): ResultsRepoCachePaths;
+declare function ensureResultsRepoClone(config: ResultsExportConfig): Promise<string>;
+declare function getResultsRepoStatus(config?: ResultsExportConfig): ResultsRepoStatus;
+declare function syncResultsRepo(config: ResultsExportConfig): Promise<ResultsRepoStatus>;
+declare function checkoutResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<CheckedOutResultsRepoBranch>;
+declare function prepareResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<PreparedResultsRepoBranch>;
+declare function stageResultsArtifacts(params: {
+    readonly repoDir: string;
+    readonly sourceDir: string;
+    readonly destinationDir: string;
+}): Promise<void>;
+declare function resolveResultsRepoRunsDir(config: ResultsExportConfig): string;
+declare function directorySizeBytes(targetPath: string): Promise<number>;
+declare function commitAndPushResultsBranch(params: {
+    readonly repoDir: string;
+    readonly branchName: string;
+    readonly commitMessage: string;
+}): Promise<boolean>;
+declare function pushResultsRepoBranch(config: ResultsExportConfig, branchName: string, cwd?: string): Promise<void>;
+declare function createDraftResultsPr(params: {
+    readonly repo: string;
+    readonly repoDir: string;
+    readonly baseBranch: string;
+    readonly branchName: string;
+    readonly title: string;
+    readonly body: string;
+}): Promise<string>;
 declare function getAgentvHome(): string;
 declare function getWorkspacesRoot(): string;
 declare function getSubagentsRoot(): string;
@@ -4186,4 +4372,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, 
type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, 
type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, 
discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, 
toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type 
EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, 
ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, 
consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, 
resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };