@agentv/core 4.10.0 → 4.11.2-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -571,6 +571,8 @@ type RepoSource = {
571
571
  };
572
572
  type RepoCheckout = {
573
573
  readonly ref?: string;
574
+ /** SWE-bench-friendly alias for ref when pinning a dataset snapshot commit */
575
+ readonly base_commit?: string;
574
576
  readonly resolve?: 'remote' | 'local';
575
577
  readonly ancestor?: number;
576
578
  };
@@ -580,8 +582,10 @@ type RepoClone = {
580
582
  readonly sparse?: readonly string[];
581
583
  };
582
584
  type RepoConfig = {
583
- readonly path: string;
584
- readonly source: RepoSource;
585
+ /** Target path inside the workspace. Optional for Docker repos targeting the container's working directory. */
586
+ readonly path?: string;
587
+ /** Clone source. Optional for Docker prebuilt images where repos exist inside the container. */
588
+ readonly source?: RepoSource;
585
589
  readonly checkout?: RepoCheckout;
586
590
  readonly clone?: RepoClone;
587
591
  };
@@ -610,6 +614,21 @@ type WorkspaceHooksConfig = {
610
614
  /** Runs once after final test in the workspace lifecycle */
611
615
  readonly after_all?: WorkspaceHookConfig;
612
616
  };
617
+ /**
618
+ * Docker-based workspace configuration.
619
+ * When present, code-grader commands run inside a Docker container
620
+ * instead of on the host.
621
+ */
622
+ type DockerWorkspaceConfig = {
623
+ /** Docker image to use (e.g. 'swebench/sweb.eval.x86_64.django__django-15180') */
624
+ readonly image: string;
625
+ /** Container execution timeout in seconds (default: 1800) */
626
+ readonly timeout?: number;
627
+ /** Memory limit (e.g. '4g', '512m') */
628
+ readonly memory?: string;
629
+ /** CPU limit (e.g. 2, 0.5) */
630
+ readonly cpus?: number;
631
+ };
613
632
  type WorkspaceConfig = {
614
633
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
615
634
  * .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
@@ -624,6 +643,8 @@ type WorkspaceConfig = {
624
643
  readonly mode?: 'pooled' | 'temp' | 'static';
625
644
  /** Required when mode=static: use this existing directory directly */
626
645
  readonly path?: string;
646
+ /** Docker-based workspace: run grader commands inside a container */
647
+ readonly docker?: DockerWorkspaceConfig;
627
648
  };
628
649
  type CodeEvaluatorConfig = {
629
650
  readonly name: string;
@@ -1372,10 +1393,19 @@ type ExecutionDefaults = {
1372
1393
  readonly pool_workspaces?: boolean;
1373
1394
  readonly pool_slots?: number;
1374
1395
  };
1396
+ type ResultsExportConfig = {
1397
+ readonly repo: string;
1398
+ readonly path: string;
1399
+ readonly auto_push?: boolean;
1400
+ readonly branch_prefix?: string;
1401
+ };
1375
1402
  type AgentVConfig$1 = {
1376
1403
  readonly required_version?: string;
1377
1404
  readonly eval_patterns?: readonly string[];
1378
1405
  readonly execution?: ExecutionDefaults;
1406
+ readonly results?: {
1407
+ readonly export?: ResultsExportConfig;
1408
+ };
1379
1409
  };
1380
1410
  /**
1381
1411
  * Load optional .agentv/config.yaml configuration file.
@@ -2213,6 +2243,8 @@ interface EvaluationContext {
2213
2243
  readonly fileChanges?: string;
2214
2244
  /** Absolute path to the workspace directory (when workspace_template is configured) */
2215
2245
  readonly workspacePath?: string;
2246
+ /** Docker workspace config: when present, code-grader commands run inside a container */
2247
+ readonly dockerConfig?: DockerWorkspaceConfig;
2216
2248
  }
2217
2249
  interface EvaluationScore {
2218
2250
  readonly score: number;
@@ -2813,9 +2845,9 @@ declare class RepoManager {
2813
2845
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
2814
2846
  */
2815
2847
  materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
2816
- /** Materialize all repos into the workspace. */
2848
+ /** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
2817
2849
  materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
2818
- /** Reset repos in workspace to their checkout state. */
2850
+ /** Reset repos in workspace to their checkout state. Skips repos without path or source. */
2819
2851
  reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
2820
2852
  }
2821
2853
 
@@ -3606,6 +3638,106 @@ interface DepsScanResult {
3606
3638
  */
3607
3639
  declare function scanRepoDeps(evalFilePaths: readonly string[]): Promise<DepsScanResult>;
3608
3640
 
3641
+ interface RepoCheckoutTarget {
3642
+ readonly path?: string;
3643
+ readonly ref: string;
3644
+ }
3645
+
3646
+ /**
3647
+ * Docker workspace provider — manages Docker container lifecycle for eval grading.
3648
+ *
3649
+ * Flow: pull image → create container → copy files in → exec grader → parse output → destroy container.
3650
+ * All Docker commands use `execFile` (no shell) for security.
3651
+ *
3652
+ * To add a new Docker command: add a method that calls `this.exec(...)` with the appropriate argv.
3653
+ *
3654
+ * Design decisions:
3655
+ * - CommandExecutor interface for testability (mock `execFile` in tests)
3656
+ * - Always `docker rm -f` in cleanup, even on errors (try/finally)
3657
+ * - Lazy-loaded: non-Docker evals never import this module
3658
+ */
3659
+
3660
+ /** Result of a command execution */
3661
+ interface ExecResult {
3662
+ readonly stdout: string;
3663
+ readonly stderr: string;
3664
+ readonly exitCode: number;
3665
+ }
3666
+ /** Abstraction over process execution for testability */
3667
+ interface CommandExecutor {
3668
+ exec(argv: readonly string[], options?: {
3669
+ timeoutMs?: number;
3670
+ stdin?: string;
3671
+ }): Promise<ExecResult>;
3672
+ }
3673
+ /** Options for creating a Docker container */
3674
+ interface CreateContainerOptions {
3675
+ readonly image: string;
3676
+ readonly memory?: string;
3677
+ readonly cpus?: number;
3678
+ }
3679
+ /** Options for executing a command inside a container */
3680
+ interface ExecInContainerOptions {
3681
+ readonly containerId: string;
3682
+ readonly command: readonly string[];
3683
+ readonly timeoutMs?: number;
3684
+ readonly stdin?: string;
3685
+ }
3686
+ /**
3687
+ * Manages Docker container lifecycle for workspace-based evaluations.
3688
+ *
3689
+ * Usage:
3690
+ * const docker = new DockerWorkspaceProvider(config);
3691
+ * await docker.pullImage();
3692
+ * const containerId = await docker.createContainer();
3693
+ * try {
3694
+ * await docker.copyToContainer(containerId, localPath, containerPath);
3695
+ * const output = await docker.execInContainer({ containerId, command: [...] });
3696
+ * // parse output...
3697
+ * } finally {
3698
+ * await docker.removeContainer(containerId);
3699
+ * }
3700
+ */
3701
+ declare class DockerWorkspaceProvider {
3702
+ private readonly config;
3703
+ private readonly executor;
3704
+ private readonly timeoutMs;
3705
+ constructor(config: DockerWorkspaceConfig, executor?: CommandExecutor);
3706
+ /** Check whether the Docker CLI is available on the host. */
3707
+ isDockerAvailable(): Promise<boolean>;
3708
+ /** Pull the configured Docker image. No-op if already cached locally. */
3709
+ pullImage(): Promise<void>;
3710
+ /** Create a stopped container from the configured image with resource limits. Returns container ID. */
3711
+ createContainer(): Promise<string>;
3712
+ /** Start a previously created container. */
3713
+ startContainer(containerId: string): Promise<void>;
3714
+ /**
3715
+ * Reset the container checkout to the specified target refs, if any.
3716
+ * This is used for SWE-bench images where the repo state must match the
3717
+ * dataset's base snapshot before grading begins.
3718
+ */
3719
+ resetContainerCheckout(containerId: string, repoCheckouts?: readonly RepoCheckoutTarget[]): Promise<void>;
3720
+ /** Copy a local file or directory into a running container. */
3721
+ copyToContainer(containerId: string, localPath: string, containerPath: string): Promise<void>;
3722
+ /**
3723
+ * Execute a command inside a running container.
3724
+ * If stdin is provided, it is piped via `docker exec -i`.
3725
+ */
3726
+ execInContainer(options: ExecInContainerOptions): Promise<ExecResult>;
3727
+ /** Force-remove a container (always succeeds, even if container doesn't exist). */
3728
+ removeContainer(containerId: string): Promise<void>;
3729
+ /** Full lifecycle: create → start → exec → cleanup. Convenience for single-command grading. */
3730
+ runGraderInContainer(options: {
3731
+ readonly command: readonly string[];
3732
+ readonly stdin?: string;
3733
+ readonly copyFiles?: ReadonlyArray<{
3734
+ localPath: string;
3735
+ containerPath: string;
3736
+ }>;
3737
+ readonly repoCheckouts?: readonly RepoCheckoutTarget[];
3738
+ }): Promise<ExecResult>;
3739
+ }
3740
+
3609
3741
  /**
3610
3742
  * File-based LLM response cache.
3611
3743
  * Stores provider responses as JSON files keyed by SHA-256 hash.
@@ -3662,6 +3794,60 @@ declare function toSnakeCaseDeep(obj: unknown): unknown;
3662
3794
  */
3663
3795
  declare function toCamelCaseDeep(obj: unknown): unknown;
3664
3796
 
3797
+ interface ResultsRepoCachePaths {
3798
+ readonly rootDir: string;
3799
+ readonly repoDir: string;
3800
+ readonly statusFile: string;
3801
+ }
3802
+ interface ResultsRepoStatus {
3803
+ readonly configured: boolean;
3804
+ readonly available: boolean;
3805
+ readonly repo?: string;
3806
+ readonly path?: string;
3807
+ readonly auto_push?: boolean;
3808
+ readonly branch_prefix?: string;
3809
+ readonly cache_dir?: string;
3810
+ readonly last_synced_at?: string;
3811
+ readonly last_error?: string;
3812
+ }
3813
+ interface CheckedOutResultsRepoBranch {
3814
+ readonly branchName: string;
3815
+ readonly baseBranch: string;
3816
+ readonly repoDir: string;
3817
+ }
3818
+ interface PreparedResultsRepoBranch extends CheckedOutResultsRepoBranch {
3819
+ readonly cleanup: () => Promise<void>;
3820
+ }
3821
+ declare function normalizeResultsExportConfig(config: ResultsExportConfig): Required<ResultsExportConfig>;
3822
+ declare function resolveResultsRepoUrl(repo: string): string;
3823
+ declare function getResultsRepoCachePaths(repo: string): ResultsRepoCachePaths;
3824
+ declare function ensureResultsRepoClone(config: ResultsExportConfig): Promise<string>;
3825
+ declare function getResultsRepoStatus(config?: ResultsExportConfig): ResultsRepoStatus;
3826
+ declare function syncResultsRepo(config: ResultsExportConfig): Promise<ResultsRepoStatus>;
3827
+ declare function checkoutResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<CheckedOutResultsRepoBranch>;
3828
+ declare function prepareResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<PreparedResultsRepoBranch>;
3829
+ declare function stageResultsArtifacts(params: {
3830
+ readonly repoDir: string;
3831
+ readonly sourceDir: string;
3832
+ readonly destinationDir: string;
3833
+ }): Promise<void>;
3834
+ declare function resolveResultsRepoRunsDir(config: ResultsExportConfig): string;
3835
+ declare function directorySizeBytes(targetPath: string): Promise<number>;
3836
+ declare function commitAndPushResultsBranch(params: {
3837
+ readonly repoDir: string;
3838
+ readonly branchName: string;
3839
+ readonly commitMessage: string;
3840
+ }): Promise<boolean>;
3841
+ declare function pushResultsRepoBranch(config: ResultsExportConfig, branchName: string, cwd?: string): Promise<void>;
3842
+ declare function createDraftResultsPr(params: {
3843
+ readonly repo: string;
3844
+ readonly repoDir: string;
3845
+ readonly baseBranch: string;
3846
+ readonly branchName: string;
3847
+ readonly title: string;
3848
+ readonly body: string;
3849
+ }): Promise<string>;
3850
+
3665
3851
  declare function getAgentvHome(): string;
3666
3852
  declare function getWorkspacesRoot(): string;
3667
3853
  declare function getSubagentsRoot(): string;
@@ -4186,4 +4372,4 @@ type AgentKernel = {
4186
4372
  };
4187
4373
  declare function createAgentKernel(): AgentKernel;
4188
4374
 
4189
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4375
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -571,6 +571,8 @@ type RepoSource = {
571
571
  };
572
572
  type RepoCheckout = {
573
573
  readonly ref?: string;
574
+ /** SWE-bench-friendly alias for ref when pinning a dataset snapshot commit */
575
+ readonly base_commit?: string;
574
576
  readonly resolve?: 'remote' | 'local';
575
577
  readonly ancestor?: number;
576
578
  };
@@ -580,8 +582,10 @@ type RepoClone = {
580
582
  readonly sparse?: readonly string[];
581
583
  };
582
584
  type RepoConfig = {
583
- readonly path: string;
584
- readonly source: RepoSource;
585
+ /** Target path inside the workspace. Optional for Docker repos targeting the container's working directory. */
586
+ readonly path?: string;
587
+ /** Clone source. Optional for Docker prebuilt images where repos exist inside the container. */
588
+ readonly source?: RepoSource;
585
589
  readonly checkout?: RepoCheckout;
586
590
  readonly clone?: RepoClone;
587
591
  };
@@ -610,6 +614,21 @@ type WorkspaceHooksConfig = {
610
614
  /** Runs once after final test in the workspace lifecycle */
611
615
  readonly after_all?: WorkspaceHookConfig;
612
616
  };
617
+ /**
618
+ * Docker-based workspace configuration.
619
+ * When present, code-grader commands run inside a Docker container
620
+ * instead of on the host.
621
+ */
622
+ type DockerWorkspaceConfig = {
623
+ /** Docker image to use (e.g. 'swebench/sweb.eval.x86_64.django__django-15180') */
624
+ readonly image: string;
625
+ /** Container execution timeout in seconds (default: 1800) */
626
+ readonly timeout?: number;
627
+ /** Memory limit (e.g. '4g', '512m') */
628
+ readonly memory?: string;
629
+ /** CPU limit (e.g. 2, 0.5) */
630
+ readonly cpus?: number;
631
+ };
613
632
  type WorkspaceConfig = {
614
633
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
615
634
  * .code-workspace files are used by VS Code providers; CLI providers use the parent directory. */
@@ -624,6 +643,8 @@ type WorkspaceConfig = {
624
643
  readonly mode?: 'pooled' | 'temp' | 'static';
625
644
  /** Required when mode=static: use this existing directory directly */
626
645
  readonly path?: string;
646
+ /** Docker-based workspace: run grader commands inside a container */
647
+ readonly docker?: DockerWorkspaceConfig;
627
648
  };
628
649
  type CodeEvaluatorConfig = {
629
650
  readonly name: string;
@@ -1372,10 +1393,19 @@ type ExecutionDefaults = {
1372
1393
  readonly pool_workspaces?: boolean;
1373
1394
  readonly pool_slots?: number;
1374
1395
  };
1396
+ type ResultsExportConfig = {
1397
+ readonly repo: string;
1398
+ readonly path: string;
1399
+ readonly auto_push?: boolean;
1400
+ readonly branch_prefix?: string;
1401
+ };
1375
1402
  type AgentVConfig$1 = {
1376
1403
  readonly required_version?: string;
1377
1404
  readonly eval_patterns?: readonly string[];
1378
1405
  readonly execution?: ExecutionDefaults;
1406
+ readonly results?: {
1407
+ readonly export?: ResultsExportConfig;
1408
+ };
1379
1409
  };
1380
1410
  /**
1381
1411
  * Load optional .agentv/config.yaml configuration file.
@@ -2213,6 +2243,8 @@ interface EvaluationContext {
2213
2243
  readonly fileChanges?: string;
2214
2244
  /** Absolute path to the workspace directory (when workspace_template is configured) */
2215
2245
  readonly workspacePath?: string;
2246
+ /** Docker workspace config: when present, code-grader commands run inside a container */
2247
+ readonly dockerConfig?: DockerWorkspaceConfig;
2216
2248
  }
2217
2249
  interface EvaluationScore {
2218
2250
  readonly score: number;
@@ -2813,9 +2845,9 @@ declare class RepoManager {
2813
2845
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
2814
2846
  */
2815
2847
  materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
2816
- /** Materialize all repos into the workspace. */
2848
+ /** Materialize all repos into the workspace. Skips repos without source (Docker-only repos). */
2817
2849
  materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
2818
- /** Reset repos in workspace to their checkout state. */
2850
+ /** Reset repos in workspace to their checkout state. Skips repos without path or source. */
2819
2851
  reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
2820
2852
  }
2821
2853
 
@@ -3606,6 +3638,106 @@ interface DepsScanResult {
3606
3638
  */
3607
3639
  declare function scanRepoDeps(evalFilePaths: readonly string[]): Promise<DepsScanResult>;
3608
3640
 
3641
+ interface RepoCheckoutTarget {
3642
+ readonly path?: string;
3643
+ readonly ref: string;
3644
+ }
3645
+
3646
+ /**
3647
+ * Docker workspace provider — manages Docker container lifecycle for eval grading.
3648
+ *
3649
+ * Flow: pull image → create container → copy files in → exec grader → parse output → destroy container.
3650
+ * All Docker commands use `execFile` (no shell) for security.
3651
+ *
3652
+ * To add a new Docker command: add a method that calls `this.exec(...)` with the appropriate argv.
3653
+ *
3654
+ * Design decisions:
3655
+ * - CommandExecutor interface for testability (mock `execFile` in tests)
3656
+ * - Always `docker rm -f` in cleanup, even on errors (try/finally)
3657
+ * - Lazy-loaded: non-Docker evals never import this module
3658
+ */
3659
+
3660
+ /** Result of a command execution */
3661
+ interface ExecResult {
3662
+ readonly stdout: string;
3663
+ readonly stderr: string;
3664
+ readonly exitCode: number;
3665
+ }
3666
+ /** Abstraction over process execution for testability */
3667
+ interface CommandExecutor {
3668
+ exec(argv: readonly string[], options?: {
3669
+ timeoutMs?: number;
3670
+ stdin?: string;
3671
+ }): Promise<ExecResult>;
3672
+ }
3673
+ /** Options for creating a Docker container */
3674
+ interface CreateContainerOptions {
3675
+ readonly image: string;
3676
+ readonly memory?: string;
3677
+ readonly cpus?: number;
3678
+ }
3679
+ /** Options for executing a command inside a container */
3680
+ interface ExecInContainerOptions {
3681
+ readonly containerId: string;
3682
+ readonly command: readonly string[];
3683
+ readonly timeoutMs?: number;
3684
+ readonly stdin?: string;
3685
+ }
3686
+ /**
3687
+ * Manages Docker container lifecycle for workspace-based evaluations.
3688
+ *
3689
+ * Usage:
3690
+ * const docker = new DockerWorkspaceProvider(config);
3691
+ * await docker.pullImage();
3692
+ * const containerId = await docker.createContainer();
3693
+ * try {
3694
+ * await docker.copyToContainer(containerId, localPath, containerPath);
3695
+ * const output = await docker.execInContainer({ containerId, command: [...] });
3696
+ * // parse output...
3697
+ * } finally {
3698
+ * await docker.removeContainer(containerId);
3699
+ * }
3700
+ */
3701
+ declare class DockerWorkspaceProvider {
3702
+ private readonly config;
3703
+ private readonly executor;
3704
+ private readonly timeoutMs;
3705
+ constructor(config: DockerWorkspaceConfig, executor?: CommandExecutor);
3706
+ /** Check whether the Docker CLI is available on the host. */
3707
+ isDockerAvailable(): Promise<boolean>;
3708
+ /** Pull the configured Docker image. No-op if already cached locally. */
3709
+ pullImage(): Promise<void>;
3710
+ /** Create a stopped container from the configured image with resource limits. Returns container ID. */
3711
+ createContainer(): Promise<string>;
3712
+ /** Start a previously created container. */
3713
+ startContainer(containerId: string): Promise<void>;
3714
+ /**
3715
+ * Reset the container checkout to the specified target refs, if any.
3716
+ * This is used for SWE-bench images where the repo state must match the
3717
+ * dataset's base snapshot before grading begins.
3718
+ */
3719
+ resetContainerCheckout(containerId: string, repoCheckouts?: readonly RepoCheckoutTarget[]): Promise<void>;
3720
+ /** Copy a local file or directory into a running container. */
3721
+ copyToContainer(containerId: string, localPath: string, containerPath: string): Promise<void>;
3722
+ /**
3723
+ * Execute a command inside a running container.
3724
+ * If stdin is provided, it is piped via `docker exec -i`.
3725
+ */
3726
+ execInContainer(options: ExecInContainerOptions): Promise<ExecResult>;
3727
+ /** Force-remove a container (always succeeds, even if container doesn't exist). */
3728
+ removeContainer(containerId: string): Promise<void>;
3729
+ /** Full lifecycle: create → start → exec → cleanup. Convenience for single-command grading. */
3730
+ runGraderInContainer(options: {
3731
+ readonly command: readonly string[];
3732
+ readonly stdin?: string;
3733
+ readonly copyFiles?: ReadonlyArray<{
3734
+ localPath: string;
3735
+ containerPath: string;
3736
+ }>;
3737
+ readonly repoCheckouts?: readonly RepoCheckoutTarget[];
3738
+ }): Promise<ExecResult>;
3739
+ }
3740
+
3609
3741
  /**
3610
3742
  * File-based LLM response cache.
3611
3743
  * Stores provider responses as JSON files keyed by SHA-256 hash.
@@ -3662,6 +3794,60 @@ declare function toSnakeCaseDeep(obj: unknown): unknown;
3662
3794
  */
3663
3795
  declare function toCamelCaseDeep(obj: unknown): unknown;
3664
3796
 
3797
+ interface ResultsRepoCachePaths {
3798
+ readonly rootDir: string;
3799
+ readonly repoDir: string;
3800
+ readonly statusFile: string;
3801
+ }
3802
+ interface ResultsRepoStatus {
3803
+ readonly configured: boolean;
3804
+ readonly available: boolean;
3805
+ readonly repo?: string;
3806
+ readonly path?: string;
3807
+ readonly auto_push?: boolean;
3808
+ readonly branch_prefix?: string;
3809
+ readonly cache_dir?: string;
3810
+ readonly last_synced_at?: string;
3811
+ readonly last_error?: string;
3812
+ }
3813
+ interface CheckedOutResultsRepoBranch {
3814
+ readonly branchName: string;
3815
+ readonly baseBranch: string;
3816
+ readonly repoDir: string;
3817
+ }
3818
+ interface PreparedResultsRepoBranch extends CheckedOutResultsRepoBranch {
3819
+ readonly cleanup: () => Promise<void>;
3820
+ }
3821
+ declare function normalizeResultsExportConfig(config: ResultsExportConfig): Required<ResultsExportConfig>;
3822
+ declare function resolveResultsRepoUrl(repo: string): string;
3823
+ declare function getResultsRepoCachePaths(repo: string): ResultsRepoCachePaths;
3824
+ declare function ensureResultsRepoClone(config: ResultsExportConfig): Promise<string>;
3825
+ declare function getResultsRepoStatus(config?: ResultsExportConfig): ResultsRepoStatus;
3826
+ declare function syncResultsRepo(config: ResultsExportConfig): Promise<ResultsRepoStatus>;
3827
+ declare function checkoutResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<CheckedOutResultsRepoBranch>;
3828
+ declare function prepareResultsRepoBranch(config: ResultsExportConfig, branchName: string): Promise<PreparedResultsRepoBranch>;
3829
+ declare function stageResultsArtifacts(params: {
3830
+ readonly repoDir: string;
3831
+ readonly sourceDir: string;
3832
+ readonly destinationDir: string;
3833
+ }): Promise<void>;
3834
+ declare function resolveResultsRepoRunsDir(config: ResultsExportConfig): string;
3835
+ declare function directorySizeBytes(targetPath: string): Promise<number>;
3836
+ declare function commitAndPushResultsBranch(params: {
3837
+ readonly repoDir: string;
3838
+ readonly branchName: string;
3839
+ readonly commitMessage: string;
3840
+ }): Promise<boolean>;
3841
+ declare function pushResultsRepoBranch(config: ResultsExportConfig, branchName: string, cwd?: string): Promise<void>;
3842
+ declare function createDraftResultsPr(params: {
3843
+ readonly repo: string;
3844
+ readonly repoDir: string;
3845
+ readonly baseBranch: string;
3846
+ readonly branchName: string;
3847
+ readonly title: string;
3848
+ readonly body: string;
3849
+ }): Promise<string>;
3850
+
3665
3851
  declare function getAgentvHome(): string;
3666
3852
  declare function getWorkspacesRoot(): string;
3667
3853
  declare function getSubagentsRoot(): string;
@@ -4186,4 +4372,4 @@ type AgentKernel = {
4186
4372
  };
4187
4373
  declare function createAgentKernel(): AgentKernel;
4188
4374
 
4189
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
4375
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AgentVConfig$1 as AgentVYamlConfig, type AnthropicResolvedConfig, type ApiFormat, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, COMMON_TARGET_SETTINGS, type CacheConfig, type CheckedOutResultsRepoBranch, type ChildEvaluatorResult, type ClaudeDiscoverOptions, type ClaudeResolvedConfig, type ClaudeSession, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexDiscoverOptions, type CodexSession, type CommandExecutor, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type Content, type ContentFile, type ContentImage, type ContentPreprocessorConfig, type ContentText, type CopilotCliResolvedConfig, type DiscoverOptions as CopilotDiscoverOptions, type CopilotLogResolvedConfig, type CopilotSdkResolvedConfig, type CopilotSession, type CopilotSessionMeta, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, type CreateContainerOptions, DEFAULT_CATEGORY, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DEFAULT_THRESHOLD, type DepsScanResult, DeterministicAssertionEvaluator, type DockerWorkspaceConfig, DockerWorkspaceProvider, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecInContainerOptions, type ExecResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, PASS_THRESHOLD, type ParsedCopilotSession, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type PreparedResultsRepoBranch, type ProgressEvent, type ProjectEntry, type ProjectRegistry, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, type RepoDep, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type ResultsExportConfig, type ResultsRepoCachePaths, type ResultsRepoStatus, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranscriptEntry, type TranscriptJsonLine, TranscriptProvider, type TranscriptSource, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, addProject, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, checkoutResultsRepoBranch, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, commitAndPushResultsBranch, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createDraftResultsPr, createProvider, createTempWorkspace, deepEqual, defineConfig, deriveCategory, deriveProjectId, detectFormat, directorySizeBytes, discoverAssertions, discoverClaudeSessions, discoverCodexSessions, discoverCopilotSessions, discoverGraders, discoverGraders as discoverJudges, discoverProjects, discoverProviders, ensureResultsRepoClone, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractImageBlocks, extractJsonBlob, extractLastAssistantContent, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getProject, getProjectsRegistryPath, getResultsRepoCachePaths, getResultsRepoStatus, getSubagentsRoot, getTextContent, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isContent, isContentArray, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadProjectRegistry, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, normalizeResultsExportConfig, parseAgentSkillsEvals, parseClaudeSession, parseCodexSession, parseCopilotEvents, parseJsonFromText, parseJsonSafe, prepareResultsRepoBranch, pushResultsRepoBranch, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, readTranscriptFile, readTranscriptJsonl, removeProject, resolveAndCreateProvider, resolveDelegatedTargetDefinition, resolveFileReference, resolveResultsRepoRunsDir, resolveResultsRepoUrl, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, saveProjectRegistry, scanRepoDeps, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, stageResultsArtifacts, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, syncResultsRepo, toCamelCaseDeep, toSnakeCaseDeep, toTranscriptJsonLine, tokensPerTool, touchProject, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };