@agentv/core 2.15.0 → 2.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -514,9 +514,28 @@ type RepoConfig = {
514
514
  readonly checkout?: RepoCheckout;
515
515
  readonly clone?: RepoClone;
516
516
  };
517
- type ResetConfig = {
518
- readonly strategy?: 'none' | 'hard' | 'recreate';
519
- readonly after_each?: boolean;
517
+ type WorkspaceHookConfig = {
518
+ /** Optional command array to execute (e.g., ["bun", "run", "setup.ts"]) */
519
+ readonly command?: readonly string[];
520
+ /** @deprecated Use `command` instead */
521
+ readonly script?: readonly string[];
522
+ /** Optional timeout in milliseconds */
523
+ readonly timeout_ms?: number;
524
+ readonly timeoutMs?: number;
525
+ /** Optional working directory for command execution */
526
+ readonly cwd?: string;
527
+ /** Optional reset policy for this hook */
528
+ readonly reset?: 'none' | 'fast' | 'strict';
529
+ };
530
+ type WorkspaceHooksConfig = {
531
+ /** Runs once before first test in the workspace lifecycle */
532
+ readonly before_all?: WorkspaceHookConfig;
533
+ /** Runs before each test case */
534
+ readonly before_each?: WorkspaceHookConfig;
535
+ /** Runs after each test case */
536
+ readonly after_each?: WorkspaceHookConfig;
537
+ /** Runs once after final test in the workspace lifecycle */
538
+ readonly after_all?: WorkspaceHookConfig;
520
539
  };
521
540
  type WorkspaceConfig = {
522
541
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
@@ -526,16 +545,14 @@ type WorkspaceConfig = {
526
545
  readonly isolation?: 'shared' | 'per_test';
527
546
  /** Repository definitions to clone/checkout into workspace */
528
547
  readonly repos?: readonly RepoConfig[];
529
- /** Reset configuration for repos between test runs */
530
- readonly reset?: ResetConfig;
531
- /** Command to run once before first test (after workspace creation, before git baseline) */
532
- readonly before_all?: WorkspaceScriptConfig;
533
- /** Command to run once after last test (before workspace cleanup) */
534
- readonly after_all?: WorkspaceScriptConfig;
535
- /** Command to run before each test */
536
- readonly before_each?: WorkspaceScriptConfig;
537
- /** Command to run after each test (e.g., git reset for workspace reuse) */
538
- readonly after_each?: WorkspaceScriptConfig;
548
+ /** Workspace lifecycle hooks */
549
+ readonly hooks?: WorkspaceHooksConfig;
550
+ /** Workspace materialization mode */
551
+ readonly mode?: 'pooled' | 'ephemeral' | 'static';
552
+ /** Required when mode=static: use this existing directory directly */
553
+ readonly static_path?: string;
554
+ /** @deprecated Use mode=pooled|ephemeral|static */
555
+ readonly pool?: boolean;
539
556
  };
540
557
  type CodeEvaluatorConfig = {
541
558
  readonly name: string;
@@ -2347,34 +2364,18 @@ declare class DeterministicAssertionEvaluator implements Evaluator {
2347
2364
  }
2348
2365
 
2349
2366
  declare class RepoManager {
2350
- private readonly cacheDir;
2351
2367
  private readonly verbose;
2352
- constructor(cacheDir?: string, verbose?: boolean);
2368
+ constructor(verbose?: boolean);
2353
2369
  private runGit;
2354
2370
  /**
2355
- * Ensure a bare mirror cache exists for the given source.
2356
- * Creates on first access, fetches updates on subsequent calls.
2357
- * Returns the absolute path to the cache directory.
2358
- */
2359
- ensureCache(source: RepoSource, depth?: number, resolve?: 'remote' | 'local'): Promise<string>;
2360
- /**
2361
- * Clone a repo from cache into the workspace at the configured path.
2371
+ * Clone a repo directly from source into the workspace at the configured path.
2362
2372
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
2363
2373
  */
2364
2374
  materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
2365
2375
  /** Materialize all repos into the workspace. */
2366
2376
  materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
2367
2377
  /** Reset repos in workspace to their checkout state. */
2368
- reset(repos: readonly RepoConfig[], workspacePath: string, strategy: 'hard' | 'recreate'): Promise<void>;
2369
- /**
2370
- * Seed the cache from a local repository, setting the remote to a given URL.
2371
- * Useful for avoiding slow network clones when a local clone already exists.
2372
- */
2373
- seedCache(localPath: string, remoteUrl: string, opts?: {
2374
- force?: boolean;
2375
- }): Promise<string>;
2376
- /** Remove the entire cache directory. */
2377
- cleanCache(): Promise<void>;
2378
+ reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
2378
2379
  }
2379
2380
 
2380
2381
  type MaybePromise<T> = T | Promise<T>;
@@ -2406,6 +2407,10 @@ interface RunEvalCaseOptions {
2406
2407
  readonly keepWorkspaces?: boolean;
2407
2408
  /** Force cleanup of workspaces even on failure */
2408
2409
  readonly cleanupWorkspaces?: boolean;
2410
+ /** Retention policy for temp workspaces on successful cases */
2411
+ readonly retainOnSuccess?: 'keep' | 'cleanup';
2412
+ /** Retention policy for temp workspaces on failed cases */
2413
+ readonly retainOnFailure?: 'keep' | 'cleanup';
2409
2414
  /** Pre-created shared workspace path (shared across tests in a suite) */
2410
2415
  readonly sharedWorkspacePath?: string;
2411
2416
  /** Pre-initialized baseline commit for shared workspace */
@@ -2461,12 +2466,22 @@ interface RunEvaluationOptions {
2461
2466
  readonly totalBudgetUsd?: number;
2462
2467
  /** Execution error tolerance: true halts on first error */
2463
2468
  readonly failOnError?: FailOnError;
2464
- /** Opt-in: reuse materialized workspaces across eval runs */
2469
+ /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
2465
2470
  readonly poolWorkspaces?: boolean;
2466
2471
  /** Maximum number of pool slots on disk (default: 10, max: 50) */
2467
2472
  readonly poolMaxSlots?: number;
2468
2473
  /** Pre-existing workspace directory to use directly (skips clone/copy/pool) */
2469
2474
  readonly workspace?: string;
2475
+ /** Workspace materialization mode override */
2476
+ readonly workspaceMode?: 'pooled' | 'ephemeral' | 'static';
2477
+ /** Static workspace path override (used when workspaceMode=static) */
2478
+ readonly workspacePath?: string;
2479
+ /** Workspace clean policy override for pooled reset */
2480
+ readonly workspaceClean?: 'standard' | 'full';
2481
+ /** Retention policy override for successful cases */
2482
+ readonly retainOnSuccess?: 'keep' | 'cleanup';
2483
+ /** Retention policy override for failed cases */
2484
+ readonly retainOnFailure?: 'keep' | 'cleanup';
2470
2485
  }
2471
2486
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2472
2487
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2978,6 +2993,7 @@ interface AcquireWorkspaceOptions {
2978
2993
  repos: readonly RepoConfig[];
2979
2994
  maxSlots: number;
2980
2995
  repoManager: RepoManager;
2996
+ poolReset?: 'none' | 'fast' | 'strict';
2981
2997
  }
2982
2998
  interface PoolSlot {
2983
2999
  readonly index: number;
@@ -3106,7 +3122,6 @@ declare function toCamelCaseDeep(obj: unknown): unknown;
3106
3122
 
3107
3123
  declare function getAgentvHome(): string;
3108
3124
  declare function getWorkspacesRoot(): string;
3109
- declare function getGitCacheRoot(): string;
3110
3125
  declare function getSubagentsRoot(): string;
3111
3126
  declare function getTraceStateRoot(): string;
3112
3127
  declare function getWorkspacePoolRoot(): string;
@@ -3265,4 +3280,4 @@ type AgentKernel = {
3265
3280
  };
3266
3281
  declare function createAgentKernel(): AgentKernel;
3267
3282
 
3268
- export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getGitCacheRoot, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3283
+ export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -514,9 +514,28 @@ type RepoConfig = {
514
514
  readonly checkout?: RepoCheckout;
515
515
  readonly clone?: RepoClone;
516
516
  };
517
- type ResetConfig = {
518
- readonly strategy?: 'none' | 'hard' | 'recreate';
519
- readonly after_each?: boolean;
517
+ type WorkspaceHookConfig = {
518
+ /** Optional command array to execute (e.g., ["bun", "run", "setup.ts"]) */
519
+ readonly command?: readonly string[];
520
+ /** @deprecated Use `command` instead */
521
+ readonly script?: readonly string[];
522
+ /** Optional timeout in milliseconds */
523
+ readonly timeout_ms?: number;
524
+ readonly timeoutMs?: number;
525
+ /** Optional working directory for command execution */
526
+ readonly cwd?: string;
527
+ /** Optional reset policy for this hook */
528
+ readonly reset?: 'none' | 'fast' | 'strict';
529
+ };
530
+ type WorkspaceHooksConfig = {
531
+ /** Runs once before first test in the workspace lifecycle */
532
+ readonly before_all?: WorkspaceHookConfig;
533
+ /** Runs before each test case */
534
+ readonly before_each?: WorkspaceHookConfig;
535
+ /** Runs after each test case */
536
+ readonly after_each?: WorkspaceHookConfig;
537
+ /** Runs once after final test in the workspace lifecycle */
538
+ readonly after_all?: WorkspaceHookConfig;
520
539
  };
521
540
  type WorkspaceConfig = {
522
541
  /** Template directory or .code-workspace file. Directories are copied to temp workspace.
@@ -526,16 +545,14 @@ type WorkspaceConfig = {
526
545
  readonly isolation?: 'shared' | 'per_test';
527
546
  /** Repository definitions to clone/checkout into workspace */
528
547
  readonly repos?: readonly RepoConfig[];
529
- /** Reset configuration for repos between test runs */
530
- readonly reset?: ResetConfig;
531
- /** Command to run once before first test (after workspace creation, before git baseline) */
532
- readonly before_all?: WorkspaceScriptConfig;
533
- /** Command to run once after last test (before workspace cleanup) */
534
- readonly after_all?: WorkspaceScriptConfig;
535
- /** Command to run before each test */
536
- readonly before_each?: WorkspaceScriptConfig;
537
- /** Command to run after each test (e.g., git reset for workspace reuse) */
538
- readonly after_each?: WorkspaceScriptConfig;
548
+ /** Workspace lifecycle hooks */
549
+ readonly hooks?: WorkspaceHooksConfig;
550
+ /** Workspace materialization mode */
551
+ readonly mode?: 'pooled' | 'ephemeral' | 'static';
552
+ /** Required when mode=static: use this existing directory directly */
553
+ readonly static_path?: string;
554
+ /** @deprecated Use mode=pooled|ephemeral|static */
555
+ readonly pool?: boolean;
539
556
  };
540
557
  type CodeEvaluatorConfig = {
541
558
  readonly name: string;
@@ -2347,34 +2364,18 @@ declare class DeterministicAssertionEvaluator implements Evaluator {
2347
2364
  }
2348
2365
 
2349
2366
  declare class RepoManager {
2350
- private readonly cacheDir;
2351
2367
  private readonly verbose;
2352
- constructor(cacheDir?: string, verbose?: boolean);
2368
+ constructor(verbose?: boolean);
2353
2369
  private runGit;
2354
2370
  /**
2355
- * Ensure a bare mirror cache exists for the given source.
2356
- * Creates on first access, fetches updates on subsequent calls.
2357
- * Returns the absolute path to the cache directory.
2358
- */
2359
- ensureCache(source: RepoSource, depth?: number, resolve?: 'remote' | 'local'): Promise<string>;
2360
- /**
2361
- * Clone a repo from cache into the workspace at the configured path.
2371
+ * Clone a repo directly from source into the workspace at the configured path.
2362
2372
  * Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
2363
2373
  */
2364
2374
  materialize(repo: RepoConfig, workspacePath: string): Promise<void>;
2365
2375
  /** Materialize all repos into the workspace. */
2366
2376
  materializeAll(repos: readonly RepoConfig[], workspacePath: string): Promise<void>;
2367
2377
  /** Reset repos in workspace to their checkout state. */
2368
- reset(repos: readonly RepoConfig[], workspacePath: string, strategy: 'hard' | 'recreate'): Promise<void>;
2369
- /**
2370
- * Seed the cache from a local repository, setting the remote to a given URL.
2371
- * Useful for avoiding slow network clones when a local clone already exists.
2372
- */
2373
- seedCache(localPath: string, remoteUrl: string, opts?: {
2374
- force?: boolean;
2375
- }): Promise<string>;
2376
- /** Remove the entire cache directory. */
2377
- cleanCache(): Promise<void>;
2378
+ reset(repos: readonly RepoConfig[], workspacePath: string, reset: 'fast' | 'strict'): Promise<void>;
2378
2379
  }
2379
2380
 
2380
2381
  type MaybePromise<T> = T | Promise<T>;
@@ -2406,6 +2407,10 @@ interface RunEvalCaseOptions {
2406
2407
  readonly keepWorkspaces?: boolean;
2407
2408
  /** Force cleanup of workspaces even on failure */
2408
2409
  readonly cleanupWorkspaces?: boolean;
2410
+ /** Retention policy for temp workspaces on successful cases */
2411
+ readonly retainOnSuccess?: 'keep' | 'cleanup';
2412
+ /** Retention policy for temp workspaces on failed cases */
2413
+ readonly retainOnFailure?: 'keep' | 'cleanup';
2409
2414
  /** Pre-created shared workspace path (shared across tests in a suite) */
2410
2415
  readonly sharedWorkspacePath?: string;
2411
2416
  /** Pre-initialized baseline commit for shared workspace */
@@ -2461,12 +2466,22 @@ interface RunEvaluationOptions {
2461
2466
  readonly totalBudgetUsd?: number;
2462
2467
  /** Execution error tolerance: true halts on first error */
2463
2468
  readonly failOnError?: FailOnError;
2464
- /** Opt-in: reuse materialized workspaces across eval runs */
2469
+ /** Workspace pooling: true (default) enables pool, false disables, undefined defaults to true */
2465
2470
  readonly poolWorkspaces?: boolean;
2466
2471
  /** Maximum number of pool slots on disk (default: 10, max: 50) */
2467
2472
  readonly poolMaxSlots?: number;
2468
2473
  /** Pre-existing workspace directory to use directly (skips clone/copy/pool) */
2469
2474
  readonly workspace?: string;
2475
+ /** Workspace materialization mode override */
2476
+ readonly workspaceMode?: 'pooled' | 'ephemeral' | 'static';
2477
+ /** Static workspace path override (used when workspaceMode=static) */
2478
+ readonly workspacePath?: string;
2479
+ /** Workspace clean policy override for pooled reset */
2480
+ readonly workspaceClean?: 'standard' | 'full';
2481
+ /** Retention policy override for successful cases */
2482
+ readonly retainOnSuccess?: 'keep' | 'cleanup';
2483
+ /** Retention policy override for failed cases */
2484
+ readonly retainOnFailure?: 'keep' | 'cleanup';
2470
2485
  }
2471
2486
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2472
2487
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2978,6 +2993,7 @@ interface AcquireWorkspaceOptions {
2978
2993
  repos: readonly RepoConfig[];
2979
2994
  maxSlots: number;
2980
2995
  repoManager: RepoManager;
2996
+ poolReset?: 'none' | 'fast' | 'strict';
2981
2997
  }
2982
2998
  interface PoolSlot {
2983
2999
  readonly index: number;
@@ -3106,7 +3122,6 @@ declare function toCamelCaseDeep(obj: unknown): unknown;
3106
3122
 
3107
3123
  declare function getAgentvHome(): string;
3108
3124
  declare function getWorkspacesRoot(): string;
3109
- declare function getGitCacheRoot(): string;
3110
3125
  declare function getSubagentsRoot(): string;
3111
3126
  declare function getTraceStateRoot(): string;
3112
3127
  declare function getWorkspacePoolRoot(): string;
@@ -3265,4 +3280,4 @@ type AgentKernel = {
3265
3280
  };
3266
3281
  declare function createAgentKernel(): AgentKernel;
3267
3282
 
3268
- export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getGitCacheRoot, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3283
+ export { type AcquireWorkspaceOptions, AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };